Merge remote-tracking branch 'unstable/master'
This commit is contained in:
commit
1d18ee26cc
133
build.xml
133
build.xml
|
|
@ -185,10 +185,7 @@
|
|||
<include name="**/*.class"/>
|
||||
</fileset>
|
||||
|
||||
<patternset id="dependency.mask" includes="*.jar">
|
||||
<exclude name="testng*.jar" />
|
||||
<exclude name="bcel*.jar" />
|
||||
</patternset>
|
||||
<patternset id="dependency.mask" includes="*.jar" />
|
||||
|
||||
<path id="external.dependencies">
|
||||
<fileset dir="${lib.dir}" erroronmissingdir="false">
|
||||
|
|
@ -205,6 +202,16 @@
|
|||
<pathelement location="${scala.classes}" />
|
||||
</path>
|
||||
|
||||
<path id="build.results">
|
||||
<!-- Ensure that GenomeAnalysisTK.jar comes first in the path, as it contains overrides for certain classes in our dependencies -->
|
||||
<pathelement location="${dist.dir}/GenomeAnalysisTK.jar" />
|
||||
<!-- After GenomeAnalysisTK.jar we include all of the other jars in the dist directory -->
|
||||
<fileset dir="${dist.dir}" erroronmissingdir="false">
|
||||
<patternset refid="dependency.mask" />
|
||||
<exclude name="GenomeAnalysisTK.jar" />
|
||||
</fileset>
|
||||
</path>
|
||||
|
||||
<fileset id="external.source.files" dir="${external.dir}" erroronmissingdir="false">
|
||||
<include name="**/*.java" />
|
||||
</fileset>
|
||||
|
|
@ -226,20 +233,20 @@
|
|||
<!-- the path for resources that need to go into the GATK jar;
|
||||
any additional resources should go into this set. -->
|
||||
<path id="gatk.resources">
|
||||
<fileset dir="${basedir}">
|
||||
<include name="${java.public.source.dir}/**/templates/*" />
|
||||
<include name="${java.private.source.dir}/**/templates/*" if="include.private" />
|
||||
<include name="${java.protected.source.dir}/**/templates/*" if="include.protected" />
|
||||
<fileset dir="${java.public.source.dir}">
|
||||
<include name="**/resources/*" />
|
||||
<include name="**/templates/*" />
|
||||
</fileset>
|
||||
<fileset dir="${java.private.source.dir}" erroronmissingdir="false">
|
||||
<include name="**/resources/*" if="include.private" />
|
||||
<include name="**/templates/*" if="include.private" />
|
||||
</fileset>
|
||||
<fileset dir="${java.protected.source.dir}" erroronmissingdir="false">
|
||||
<include name="**/resources/*" if="include.protected" />
|
||||
<include name="**/templates/*" if="include.protected" />
|
||||
</fileset>
|
||||
</path>
|
||||
|
||||
<path id="build.results">
|
||||
<fileset dir="${dist.dir}">
|
||||
<patternset refid="dependency.mask" />
|
||||
</fileset>
|
||||
</path>
|
||||
|
||||
|
||||
<!-- ******************************************************************************** -->
|
||||
<!-- Ivy Retrieve -->
|
||||
<!-- ******************************************************************************** -->
|
||||
|
|
@ -327,14 +334,18 @@
|
|||
|
||||
|
||||
<!-- INIT OVERRIDES: call these targets BEFORE init to override build defaults -->
|
||||
<target name="init.publiconly">
|
||||
<target name="init.build.publiconly">
|
||||
<property name="build.target" value="public" />
|
||||
</target>
|
||||
|
||||
<target name="init.publicprotectedonly">
|
||||
<target name="init.build.publicprotectedonly">
|
||||
<property name="build.target" value="protected" />
|
||||
</target>
|
||||
|
||||
<target name="init.build.all">
|
||||
<property name="build.target" value="all" />
|
||||
</target>
|
||||
|
||||
<target name="init.javaonly">
|
||||
<property name="compile.scala" value="false" />
|
||||
</target>
|
||||
|
|
@ -668,6 +679,24 @@
|
|||
</jar>
|
||||
</target>
|
||||
|
||||
<target name="na12878kb.jar" depends="gatk.compile,init.jar">
|
||||
<jar jarfile="${dist.dir}/na12878kb.jar">
|
||||
<fileset dir="${java.classes}">
|
||||
<include name="org/broadinstitute/sting/utils/GenomeLocParser*.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/GenomeLoc.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/HasGenomeLocation.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/BaseUtils.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/Utils.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/exceptions/**/*.class"/>
|
||||
<include name="org/broadinstitute/sting/gatk/walkers/na12878kb/core/**/*.class"/>
|
||||
<include name="net/sf/picard/reference/FastaSequenceFile.class"/>
|
||||
</fileset>
|
||||
<fileset dir="${java.private.source.dir}">
|
||||
<include name="org/broadinstitute/sting/gatk/walkers/na12878kb/core/resources/**/*"/>
|
||||
</fileset>
|
||||
</jar>
|
||||
</target>
|
||||
|
||||
<target name="gatk.jar" depends="gatk.compile, init.jar, R.script.stage" description="generate the GATK distribution">
|
||||
<jar jarfile="${dist.dir}/GenomeAnalysisTK.jar">
|
||||
<path refid="gatk.resources"/>
|
||||
|
|
@ -842,19 +871,23 @@
|
|||
<!-- Release-related tasks -->
|
||||
<!-- ******************************************************************************** -->
|
||||
|
||||
<target name="init.buildgatkfull" depends="init.publicprotectedonly, init.javaonly">
|
||||
<target name="init.executable.gatkfull" depends="init.build.publicprotectedonly, init.javaonly">
|
||||
<property name="executable" value="GenomeAnalysisTK" />
|
||||
</target>
|
||||
|
||||
<target name="init.buildgatklite" depends="init.publiconly, init.javaonly">
|
||||
<target name="init.executable.gatklite" depends="init.build.publiconly, init.javaonly">
|
||||
<property name="executable" value="GenomeAnalysisTKLite" />
|
||||
</target>
|
||||
|
||||
<target name="init.buildqueuefull" depends="init.publicprotectedonly, init.javaandscala">
|
||||
<target name="init.executable.queueall" depends="init.build.all, init.javaandscala">
|
||||
<property name="executable" value="Queue" />
|
||||
</target>
|
||||
|
||||
<target name="init.buildqueuelite" depends="init.publiconly, init.javaandscala">
|
||||
<target name="init.executable.queuefull" depends="init.build.publicprotectedonly, init.javaandscala">
|
||||
<property name="executable" value="Queue" />
|
||||
</target>
|
||||
|
||||
<target name="init.executable.queuelite" depends="init.build.publiconly, init.javaandscala">
|
||||
<property name="executable" value="QueueLite" />
|
||||
</target>
|
||||
|
||||
|
|
@ -906,13 +939,15 @@
|
|||
</target>
|
||||
|
||||
<!-- Package specific versions of the GATK/Queue. ALWAYS do an ant clean before invoking these! -->
|
||||
<target name="package.gatk.full" depends="init.buildgatkfull,package" />
|
||||
<target name="package.gatk.full" depends="init.executable.gatkfull,package" />
|
||||
|
||||
<target name="package.gatk.lite" depends="init.buildgatklite,package" />
|
||||
<target name="package.gatk.lite" depends="init.executable.gatklite,package" />
|
||||
|
||||
<target name="package.queue.full" depends="init.buildqueuefull,package" />
|
||||
<target name="package.queue.all" depends="init.executable.queueall,package" />
|
||||
|
||||
<target name="package.queue.lite" depends="init.buildqueuelite,package" />
|
||||
<target name="package.queue.full" depends="init.executable.queuefull,package" />
|
||||
|
||||
<target name="package.queue.lite" depends="init.executable.queuelite,package" />
|
||||
|
||||
|
||||
<!-- Release a build. Don't call this target directly. Call one of the specific release targets below -->
|
||||
|
|
@ -975,6 +1010,8 @@
|
|||
|
||||
<target name="mvninstall.gatk.lite" depends="package.gatk.lite,mvninstall" />
|
||||
|
||||
<target name="mvninstall.queue.all" depends="package.queue.all,mvninstall" />
|
||||
|
||||
<target name="mvninstall.queue.full" depends="package.queue.full,mvninstall" />
|
||||
|
||||
<target name="mvninstall.queue.lite" depends="package.queue.lite,mvninstall" />
|
||||
|
|
@ -1091,15 +1128,10 @@
|
|||
</path>
|
||||
|
||||
<path id="testng.default.classpath">
|
||||
<pathelement location="${java.classes}" />
|
||||
<pathelement location="${scala.classes}" />
|
||||
<path refid="build.results" />
|
||||
<pathelement location="${java.contracts.dir}" />
|
||||
<pathelement location="${java.test.classes}" />
|
||||
<pathelement location="${scala.test.classes}" />
|
||||
<pathelement location="${R.tar.dir}" />
|
||||
<path refid="R.script.source.path" />
|
||||
<pathelement location="${key.dir}" />
|
||||
<path refid="external.dependencies" />
|
||||
</path>
|
||||
|
||||
<!-- Test targets -->
|
||||
|
|
@ -1107,9 +1139,6 @@
|
|||
<target name="test.init.compile">
|
||||
<mkdir dir="${java.test.classes}"/>
|
||||
<mkdir dir="${scala.test.classes}"/>
|
||||
<antcall target="resolve">
|
||||
<param name="ivy.conf" value="test"/>
|
||||
</antcall>
|
||||
</target>
|
||||
|
||||
<target name="test.java.internal.compile" depends="dist,test.init.compile">
|
||||
|
|
@ -1117,10 +1146,8 @@
|
|||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
|
||||
<src refid="java.test.source.path" />
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}"/>
|
||||
<path refid="build.results" />
|
||||
<pathelement location="${java.contracts.dir}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
</javac>
|
||||
|
|
@ -1131,11 +1158,9 @@
|
|||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}" srcdir="${external.dir}">
|
||||
<include name="*/test/**/*.java"/>
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<path refid="build.results" />
|
||||
<pathelement location="${java.test.classes}"/>
|
||||
<pathelement location="${java.classes}"/>
|
||||
<pathelement location="${java.contracts.dir}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
</javac>
|
||||
|
|
@ -1148,9 +1173,8 @@
|
|||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes">
|
||||
<src refid="scala.test.source.path" />
|
||||
<classpath>
|
||||
<path refid="scala.dependencies"/>
|
||||
<path refid="build.results"/>
|
||||
<pathelement location="${java.test.classes}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
</classpath>
|
||||
</scalac>
|
||||
</target>
|
||||
|
|
@ -1192,14 +1216,16 @@
|
|||
<echo message="" />
|
||||
<echo message="Sting: Running @{testtype} test cases!"/>
|
||||
|
||||
<!-- no test is allowed to run for more than 10 hours -->
|
||||
<taskdef resource="testngtasks" classpath="${testng.jar}"/>
|
||||
<testng outputDir="@{outputdir}"
|
||||
classpathref="${testng.classpath}"
|
||||
haltOnFailure="false" failureProperty="test.failure"
|
||||
verbose="2"
|
||||
timeout="36000000"
|
||||
workingDir="${basedir}"
|
||||
useDefaultListeners="false"
|
||||
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
|
||||
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
|
||||
<jvmarg value="-Xmx${test.maxmemory}" />
|
||||
<jvmarg value="-ea" />
|
||||
<jvmarg value="-Djava.awt.headless=true" />
|
||||
|
|
@ -1355,14 +1381,13 @@
|
|||
|
||||
<!-- Fast test target that cuts major corners for speed. Requires that a full build has been done first. Java-only, single test class only -->
|
||||
<!-- Usage: ant fasttest -Dsingle=TestClass -->
|
||||
<target name="fasttest" depends="init.javaonly,init,test.init">
|
||||
<target name="fasttest" depends="init.javaonly,init">
|
||||
<condition property="not.clean">
|
||||
<and>
|
||||
<available file="${build.dir}" />
|
||||
<available file="${lib.dir}" />
|
||||
<available file="${dist.dir}" />
|
||||
<available file="${java.test.classes}" />
|
||||
<available file="${testng.jar}" />
|
||||
</and>
|
||||
</condition>
|
||||
<fail message="fasttest requires a NON-CLEAN working directory (INCLUDING test classes). Do a full test build using ant test.compile first." unless="not.clean" />
|
||||
|
|
@ -1380,13 +1405,27 @@
|
|||
<javac fork="true" memoryMaximumSize="512m" destdir="${java.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}">
|
||||
<src refid="java.test.source.path" />
|
||||
<classpath>
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}"/>
|
||||
<pathelement location="${testng.jar}"/>
|
||||
<path refid="external.dependencies" />
|
||||
</classpath>
|
||||
<compilerarg value="-proc:none"/>
|
||||
</javac>
|
||||
|
||||
<!-- fasttest uses the unpackaged class files in its test classpath to avoid having to rebuild the jars in dist/ -->
|
||||
<path id="testng.fasttest.classpath">
|
||||
<pathelement location="${java.classes}" />
|
||||
<pathelement location="${scala.classes}" />
|
||||
<pathelement location="${java.contracts.dir}" />
|
||||
<pathelement location="${java.test.classes}" />
|
||||
<pathelement location="${scala.test.classes}" />
|
||||
<pathelement location="${R.tar.dir}" />
|
||||
<path refid="R.script.source.path" />
|
||||
<pathelement location="${key.dir}" />
|
||||
<path refid="external.dependencies" />
|
||||
<path refid="java.source.path" /> <!-- Terrible hack to allow fasttest to see resource files stored in the source tree -->
|
||||
</path>
|
||||
<property name="testng.classpath" value="testng.fasttest.classpath" />
|
||||
|
||||
<run-test testtype="${single}" outputdir="${report}/${single}" runfailed="false"/>
|
||||
</target>
|
||||
</project>
|
||||
|
|
|
|||
11
ivy.xml
11
ivy.xml
|
|
@ -24,11 +24,8 @@
|
|||
|
||||
<ivy-module version="1.0">
|
||||
<info organisation="org.broadinstitute" module="Sting"/>
|
||||
<configurations defaultconfmapping="test->default">
|
||||
<configurations>
|
||||
<conf name="default" description="the core dependencies for the GATK"/>
|
||||
<conf name="test" extends="default" description="external dependencies used for testing and metrics"/>
|
||||
<conf name="scala" extends="default" description="the dependencies for scala"/>
|
||||
<conf name="queue" extends="scala" description="the dependencies for Queue"/>
|
||||
</configurations>
|
||||
<dependencies defaultconf="default">
|
||||
<dependency org="net.sf" name="sam" rev="latest.integration"/>
|
||||
|
|
@ -83,9 +80,9 @@
|
|||
<dependency org="org.scala-lang" name="scala-library" rev="2.9.2"/>
|
||||
|
||||
<!-- testing and evaluation dependencies -->
|
||||
<dependency org="org.testng" name="testng" rev="5.14.1" conf="test"/>
|
||||
<dependency org="org.uncommons" name="reportng" rev="1.1.2" conf="test"/>
|
||||
<dependency org="com.google.code.caliper" name="caliper" rev="1.0-SNAPSHOT" conf="test"/>
|
||||
<dependency org="org.testng" name="testng" rev="5.14.1"/>
|
||||
<dependency org="org.uncommons" name="reportng" rev="1.1.2"/>
|
||||
<dependency org="com.google.code.caliper" name="caliper" rev="1.0-SNAPSHOT"/>
|
||||
|
||||
<!-- Contracts for Java and dependencies -->
|
||||
<dependency org="com.google.code.cofoja" name="cofoja" rev="1.0-r139"/>
|
||||
|
|
|
|||
|
|
@ -56,18 +56,20 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
|
||||
|
||||
// keep all of the reduced reads
|
||||
final ArrayList<PileupElement> reducedReadPileups = new ArrayList<PileupElement>();
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
for( final PileupElement pe : pileup ) {
|
||||
// abort if we have a reduced read - we do not want to remove it!
|
||||
// we do not want to remove a reduced read
|
||||
if ( pe.getRead().isReducedRead() )
|
||||
return pileup;
|
||||
reducedReadPileups.add(pe);
|
||||
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
|
||||
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
|
||||
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
|
||||
|
|
@ -77,13 +79,23 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
|
||||
}
|
||||
});
|
||||
elementsToKeep.addAll(reducedReadPileups);
|
||||
|
||||
// make a listing of allele counts
|
||||
final int[] alleleCounts = new int[4];
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleCounts[i] = alleleStratifiedElements[i].size();
|
||||
|
||||
// do smart down-sampling
|
||||
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
|
||||
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||
if ( alleleList.size() <= numReadsToRemove )
|
||||
logAllElements(alleleList, log);
|
||||
// if we don't need to remove any reads, keep them all
|
||||
if ( alleleList.size() <= targetAlleleCounts[i] )
|
||||
elementsToKeep.addAll(alleleList);
|
||||
else
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log));
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
|
||||
}
|
||||
|
||||
// clean up pointers so memory can be garbage collected if needed
|
||||
|
|
@ -93,6 +105,66 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
|
||||
}
|
||||
|
||||
private static int scoreAlleleCounts(final int[] alleleCounts) {
|
||||
if ( alleleCounts.length < 2 )
|
||||
return 0;
|
||||
|
||||
// sort the counts (in ascending order)
|
||||
final int[] alleleCountsCopy = alleleCounts.clone();
|
||||
Arrays.sort(alleleCountsCopy);
|
||||
|
||||
final int maxCount = alleleCountsCopy[alleleCounts.length - 1];
|
||||
final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2];
|
||||
|
||||
int remainderCount = 0;
|
||||
for ( int i = 0; i < alleleCounts.length - 2; i++ )
|
||||
remainderCount += alleleCountsCopy[i];
|
||||
|
||||
// try to get the best score:
|
||||
// - in the het case the counts should be equal with nothing else
|
||||
// - in the hom case the non-max should be zero
|
||||
return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes an allele biased version of the given pileup
|
||||
*
|
||||
* @param alleleCounts the original pileup
|
||||
* @param numReadsToRemove fraction of total reads to remove per allele
|
||||
* @return allele biased pileup
|
||||
*/
|
||||
protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
|
||||
final int numAlleles = alleleCounts.length;
|
||||
|
||||
int maxScore = scoreAlleleCounts(alleleCounts);
|
||||
int[] alleleCountsOfMax = alleleCounts;
|
||||
|
||||
final int numReadsToRemovePerAllele = numReadsToRemove / 2;
|
||||
|
||||
for ( int i = 0; i < numAlleles; i++ ) {
|
||||
for ( int j = i; j < numAlleles; j++ ) {
|
||||
final int[] newCounts = alleleCounts.clone();
|
||||
|
||||
// split these cases so we don't lose on the floor (since we divided by 2)
|
||||
if ( i == j ) {
|
||||
newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove);
|
||||
} else {
|
||||
newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele);
|
||||
newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele);
|
||||
}
|
||||
|
||||
final int score = scoreAlleleCounts(newCounts);
|
||||
|
||||
if ( score < maxScore ) {
|
||||
maxScore = score;
|
||||
alleleCountsOfMax = newCounts;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return alleleCountsOfMax;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
|
||||
*
|
||||
|
|
@ -102,7 +174,15 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
* @return the list of pileup elements TO KEEP
|
||||
*/
|
||||
private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
|
||||
if ( numElementsToRemove == 0 )
|
||||
return elements;
|
||||
|
||||
final int pileupSize = elements.size();
|
||||
if ( numElementsToRemove == pileupSize ) {
|
||||
logAllElements(elements, log);
|
||||
return new ArrayList<PileupElement>(0);
|
||||
}
|
||||
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
|
|
@ -132,15 +212,25 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
|
||||
totalReads += reads.size();
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
int numReadsToRemove = (int)(totalReads * downsamplingFraction);
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove * alleleReadMap.size());
|
||||
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() ) {
|
||||
if ( reads.size() <= numReadsToRemove ) {
|
||||
readsToRemove.addAll(reads);
|
||||
logAllReads(reads, log);
|
||||
} else {
|
||||
readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log));
|
||||
|
||||
// make a listing of allele counts
|
||||
final List<Allele> alleles = new ArrayList<Allele>(alleleReadMap.keySet());
|
||||
alleles.remove(Allele.NO_CALL); // ignore the no-call bin
|
||||
final int numAlleles = alleles.size();
|
||||
final int[] alleleCounts = new int[numAlleles];
|
||||
for ( int i = 0; i < numAlleles; i++ )
|
||||
alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
|
||||
|
||||
// do smart down-sampling
|
||||
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
|
||||
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove);
|
||||
for ( int i = 0; i < numAlleles; i++ ) {
|
||||
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
|
||||
|
||||
if ( alleleBin.size() > targetAlleleCounts[i] ) {
|
||||
readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -156,13 +246,22 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
* @return the list of pileup elements TO REMOVE
|
||||
*/
|
||||
private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
|
||||
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
|
||||
|
||||
if ( numElementsToRemove == 0 )
|
||||
return readsToRemove;
|
||||
|
||||
final int pileupSize = reads.size();
|
||||
if ( numElementsToRemove == pileupSize ) {
|
||||
logAllReads(reads, log);
|
||||
return reads;
|
||||
}
|
||||
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(pileupSize - numElementsToRemove);
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( itemsToRemove.get(i) ) {
|
||||
final GATKSAMRecord read = reads.get(i);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,59 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLocComparator;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* A stash of regions that must be kept uncompressed in all samples
|
||||
*
|
||||
* In general, these are regions that were kept uncompressed by a tumor sample and we want to force
|
||||
* all other samples (normals and/or tumors) to also keep these regions uncompressed
|
||||
*
|
||||
* User: carneiro
|
||||
* Date: 10/15/12
|
||||
* Time: 4:08 PM
|
||||
*/
|
||||
public class CompressionStash extends TreeSet<SimpleGenomeLoc> {
|
||||
public CompressionStash() {
|
||||
super(new GenomeLocComparator());
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a SimpleGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc
|
||||
* in the stash.
|
||||
*
|
||||
* @param insertLoc the new loc to be inserted
|
||||
* @return true if the loc, or it's merged version, wasn't present in the list before.
|
||||
*/
|
||||
@Override
|
||||
public boolean add(SimpleGenomeLoc insertLoc) {
|
||||
TreeSet<SimpleGenomeLoc> removedLocs = new TreeSet<SimpleGenomeLoc>();
|
||||
for (SimpleGenomeLoc existingLoc : this) {
|
||||
if (existingLoc.isPast(insertLoc)) {
|
||||
break; // if we're past the loc we're done looking for overlaps.
|
||||
}
|
||||
if (existingLoc.equals(insertLoc)) {
|
||||
return false; // if this loc was already present in the stash, we don't need to insert it.
|
||||
}
|
||||
if (existingLoc.contiguousP(insertLoc)) {
|
||||
removedLocs.add(existingLoc); // list the original loc for merging
|
||||
}
|
||||
}
|
||||
for (SimpleGenomeLoc loc : removedLocs) {
|
||||
this.remove(loc); // remove all locs that will be merged
|
||||
}
|
||||
removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged
|
||||
return super.add(SimpleGenomeLoc.merge(removedLocs)); // merge them all into one loc and add to the stash
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean addAll(Collection<? extends SimpleGenomeLoc> locs) {
|
||||
boolean result = false;
|
||||
for (SimpleGenomeLoc loc : locs) {
|
||||
result |= this.add(loc);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -3,13 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/*
|
||||
|
|
@ -41,7 +42,7 @@ import java.util.TreeSet;
|
|||
*
|
||||
* @author depristo
|
||||
*/
|
||||
public class MultiSampleCompressor implements Compressor {
|
||||
public class MultiSampleCompressor {
|
||||
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
|
||||
|
||||
protected Map<String, SingleSampleCompressor> compressorsPerSample = new HashMap<String, SingleSampleCompressor>();
|
||||
|
|
@ -63,21 +64,36 @@ public class MultiSampleCompressor implements Compressor {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
|
||||
String sample = read.getReadGroup().getSample();
|
||||
SingleSampleCompressor compressor = compressorsPerSample.get(sample);
|
||||
public Set<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
|
||||
String sampleName = read.getReadGroup().getSample();
|
||||
SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
|
||||
if ( compressor == null )
|
||||
throw new ReviewedStingException("No compressor for sample " + sample);
|
||||
return compressor.addAlignment(read);
|
||||
throw new ReviewedStingException("No compressor for sample " + sampleName);
|
||||
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = compressor.addAlignment(read);
|
||||
Set<GATKSAMRecord> reads = readsAndStash.getFirst();
|
||||
CompressionStash regions = readsAndStash.getSecond();
|
||||
|
||||
reads.addAll(closeVariantRegionsInAllSamples(regions));
|
||||
|
||||
return reads;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
SortedSet<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
for ( SingleSampleCompressor comp : compressorsPerSample.values() )
|
||||
for ( GATKSAMRecord read : comp.close() )
|
||||
reads.add(read);
|
||||
public Set<GATKSAMRecord> close() {
|
||||
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
|
||||
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = sample.close();
|
||||
reads = readsAndStash.getFirst();
|
||||
}
|
||||
return reads;
|
||||
}
|
||||
|
||||
private Set<GATKSAMRecord> closeVariantRegionsInAllSamples(CompressionStash regions) {
|
||||
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
if (!regions.isEmpty()) {
|
||||
for (SingleSampleCompressor sample : compressorsPerSample.values()) {
|
||||
reads.addAll(sample.closeVariantRegions(regions));
|
||||
}
|
||||
}
|
||||
return reads;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,6 +25,9 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
|
|
@ -45,6 +48,7 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
|
|
@ -81,12 +85,13 @@ import java.util.*;
|
|||
*/
|
||||
|
||||
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@PartitionBy(PartitionType.INTERVAL)
|
||||
@PartitionBy(PartitionType.CONTIG)
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
|
||||
public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
|
||||
|
||||
@Output
|
||||
private StingSAMFileWriter out;
|
||||
private StingSAMFileWriter out = null;
|
||||
private SAMFileWriter writerToUse = null;
|
||||
|
||||
/**
|
||||
* The number of bases to keep around mismatches (potential variation)
|
||||
|
|
@ -196,6 +201,10 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
@Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false)
|
||||
private int nContigs = 2;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
|
||||
private boolean nwayout = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
|
||||
private int debugLevel = 0;
|
||||
|
|
@ -222,9 +231,12 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
|
||||
|
||||
CompressionStash compressionStash = new CompressionStash();
|
||||
|
||||
SortedSet<GenomeLoc> intervalList;
|
||||
|
||||
private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
|
||||
private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam";
|
||||
|
||||
/**
|
||||
* Basic generic initialization of the readNameHash and the intervalList. Output initialization
|
||||
|
|
@ -240,10 +252,22 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
if (toolkit.getIntervals() != null)
|
||||
intervalList.addAll(toolkit.getIntervals());
|
||||
|
||||
if (!NO_PG_TAG)
|
||||
Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME);
|
||||
else
|
||||
|
||||
final boolean preSorted = true;
|
||||
final boolean indexOnTheFly = true;
|
||||
final boolean keep_records = true;
|
||||
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
|
||||
if (nwayout) {
|
||||
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
|
||||
writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true);
|
||||
}
|
||||
else {
|
||||
writerToUse = out;
|
||||
out.setPresorted(false);
|
||||
if (!NO_PG_TAG) {
|
||||
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -276,7 +300,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
// Check if the read goes beyond the boundaries of the chromosome, and hard clip those boundaries.
|
||||
int chromosomeLength = ref.getGenomeLocParser().getContigInfo(read.getReferenceName()).getSequenceLength();
|
||||
if (read.getSoftStart() < 0)
|
||||
read = ReadClipper.hardClipByReadCoordinates(read, 0, -read.getSoftStart() - 1);
|
||||
read = ReadClipper.hardClipByReadCoordinates(read, 0, -read.getSoftStart());
|
||||
if (read.getSoftEnd() > chromosomeLength)
|
||||
read = ReadClipper.hardClipByReadCoordinates(read, chromosomeLength - read.getSoftStart() + 1, read.getReadLength() - 1);
|
||||
|
||||
|
|
@ -384,6 +408,9 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
// output any remaining reads in the compressor
|
||||
for (GATKSAMRecord read : stash.close())
|
||||
outputRead(read);
|
||||
|
||||
if (nwayout)
|
||||
writerToUse.close();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -552,7 +579,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
if (!DONT_COMPRESS_READ_NAMES)
|
||||
compressReadName(read);
|
||||
|
||||
out.addAlignment(read);
|
||||
writerToUse.addAlignment(read);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,8 +1,10 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
|
|
@ -10,7 +12,7 @@ import java.util.TreeSet;
|
|||
* @author carneiro, depristo
|
||||
* @version 3.0
|
||||
*/
|
||||
public class SingleSampleCompressor implements Compressor {
|
||||
public class SingleSampleCompressor {
|
||||
final private int contextSize;
|
||||
final private int downsampleCoverage;
|
||||
final private int minMappingQuality;
|
||||
|
|
@ -24,6 +26,7 @@ public class SingleSampleCompressor implements Compressor {
|
|||
private SlidingWindow slidingWindow;
|
||||
private int slidingWindowCounter;
|
||||
|
||||
public static Pair<Set<GATKSAMRecord>, CompressionStash> emptyPair = new Pair<Set<GATKSAMRecord>,CompressionStash>(new TreeSet<GATKSAMRecord>(), new CompressionStash());
|
||||
|
||||
public SingleSampleCompressor(final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
|
|
@ -46,12 +49,9 @@ public class SingleSampleCompressor implements Compressor {
|
|||
this.allowPolyploidReduction = allowPolyploidReduction;
|
||||
}
|
||||
|
||||
/**
|
||||
* @{inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> addAlignment( GATKSAMRecord read ) {
|
||||
TreeSet<GATKSAMRecord> result = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
public Pair<Set<GATKSAMRecord>, CompressionStash> addAlignment( GATKSAMRecord read ) {
|
||||
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
CompressionStash stash = new CompressionStash();
|
||||
int readOriginalStart = read.getUnclippedStart();
|
||||
|
||||
// create a new window if:
|
||||
|
|
@ -60,7 +60,9 @@ public class SingleSampleCompressor implements Compressor {
|
|||
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
|
||||
|
||||
// close the current sliding window
|
||||
result.addAll(slidingWindow.close());
|
||||
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = slidingWindow.close();
|
||||
reads = readsAndStash.getFirst();
|
||||
stash = readsAndStash.getSecond();
|
||||
slidingWindow = null; // so we create a new one on the next if
|
||||
}
|
||||
|
||||
|
|
@ -69,13 +71,16 @@ public class SingleSampleCompressor implements Compressor {
|
|||
slidingWindowCounter++;
|
||||
}
|
||||
|
||||
result.addAll(slidingWindow.addRead(read));
|
||||
return result;
|
||||
stash.addAll(slidingWindow.addRead(read));
|
||||
return new Pair<Set<GATKSAMRecord>, CompressionStash>(reads, stash);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
return (slidingWindow != null) ? slidingWindow.close() : new TreeSet<GATKSAMRecord>();
|
||||
public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
|
||||
return (slidingWindow != null) ? slidingWindow.close() : emptyPair;
|
||||
}
|
||||
|
||||
public Set<GATKSAMRecord> closeVariantRegions(CompressionStash regions) {
|
||||
return slidingWindow.closeVariantRegions(regions);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
|
|||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -57,6 +58,8 @@ public class SlidingWindow {
|
|||
|
||||
private boolean allowPolyploidReductionInGeneral;
|
||||
|
||||
private static CompressionStash emptyRegions = new CompressionStash();
|
||||
|
||||
/**
|
||||
* The types of synthetic reads to use in the finalizeAndAdd method
|
||||
*/
|
||||
|
|
@ -137,7 +140,7 @@ public class SlidingWindow {
|
|||
* @param read the read
|
||||
* @return a list of reads that have been finished by sliding the window.
|
||||
*/
|
||||
public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
|
||||
public CompressionStash addRead(GATKSAMRecord read) {
|
||||
addToHeader(windowHeader, read); // update the window header counts
|
||||
readsInWindow.add(read); // add read to sliding reads
|
||||
return slideWindow(read.getUnclippedStart());
|
||||
|
|
@ -151,8 +154,9 @@ public class SlidingWindow {
|
|||
* @param variantSite boolean array with true marking variant regions
|
||||
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
|
||||
*/
|
||||
private Pair<Integer, Integer> getNextVariantRegion(int from, int to, boolean[] variantSite) {
|
||||
private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean forceClose) {
|
||||
boolean foundStart = false;
|
||||
final int windowHeaderStart = getStartLocation(windowHeader);
|
||||
int variantRegionStartIndex = 0;
|
||||
for (int i=from; i<to; i++) {
|
||||
if (variantSite[i] && !foundStart) {
|
||||
|
|
@ -160,10 +164,12 @@ public class SlidingWindow {
|
|||
foundStart = true;
|
||||
}
|
||||
else if(!variantSite[i] && foundStart) {
|
||||
return(new Pair<Integer, Integer>(variantRegionStartIndex, i-1));
|
||||
return(new SimpleGenomeLoc(contig, contigIndex, windowHeaderStart + variantRegionStartIndex, windowHeaderStart + i - 1, true));
|
||||
}
|
||||
}
|
||||
return (foundStart) ? new Pair<Integer, Integer>(variantRegionStartIndex, -1) : null;
|
||||
final int refStart = windowHeaderStart + variantRegionStartIndex;
|
||||
final int refStop = windowHeaderStart + to - 1;
|
||||
return (foundStart && forceClose) ? new SimpleGenomeLoc(contig, contigIndex, refStart, refStop, true) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -172,25 +178,25 @@ public class SlidingWindow {
|
|||
* @param from beginning window header index of the search window (inclusive)
|
||||
* @param to end window header index of the search window (exclusive)
|
||||
* @param variantSite boolean array with true marking variant regions
|
||||
* @return a list with start/stops of variant regions following getNextVariantRegion description
|
||||
* @return a list with start/stops of variant regions following findNextVariantRegion description
|
||||
*/
|
||||
private List<Pair<Integer, Integer>> getAllVariantRegions(int from, int to, boolean[] variantSite) {
|
||||
List<Pair<Integer,Integer>> regions = new LinkedList<Pair<Integer, Integer>>();
|
||||
private CompressionStash findVariantRegions(int from, int to, boolean[] variantSite, boolean forceClose) {
|
||||
CompressionStash regions = new CompressionStash();
|
||||
int index = from;
|
||||
while(index < to) {
|
||||
Pair<Integer,Integer> result = getNextVariantRegion(index, to, variantSite);
|
||||
SimpleGenomeLoc result = findNextVariantRegion(index, to, variantSite, forceClose);
|
||||
if (result == null)
|
||||
break;
|
||||
|
||||
regions.add(result);
|
||||
if (result.getSecond() < 0)
|
||||
if (!result.isFinished())
|
||||
break;
|
||||
index = result.getSecond() + 1;
|
||||
|
||||
index = result.getStop() + 1;
|
||||
}
|
||||
return regions;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the window can be slid given the new incoming read.
|
||||
*
|
||||
|
|
@ -201,25 +207,24 @@ public class SlidingWindow {
|
|||
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
|
||||
* @return all reads that have fallen to the left of the sliding window after the slide
|
||||
*/
|
||||
protected List<GATKSAMRecord> slideWindow(final int incomingReadUnclippedStart) {
|
||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
protected CompressionStash slideWindow(final int incomingReadUnclippedStart) {
|
||||
final int windowHeaderStartLocation = getStartLocation(windowHeader);
|
||||
CompressionStash regions = emptyRegions;
|
||||
boolean forceClose = true;
|
||||
|
||||
if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
|
||||
markSites(incomingReadUnclippedStart);
|
||||
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
|
||||
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
||||
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet());
|
||||
finalizedReads = closeVariantRegions(regions, false);
|
||||
|
||||
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
|
||||
readsInWindow.pollFirst();
|
||||
}
|
||||
regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose);
|
||||
}
|
||||
|
||||
return finalizedReads;
|
||||
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
|
||||
readsInWindow.pollFirst();
|
||||
}
|
||||
|
||||
return regions;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -601,9 +606,7 @@ public class SlidingWindow {
|
|||
toRemove.add(read);
|
||||
}
|
||||
}
|
||||
for (GATKSAMRecord read : toRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
removeReadsFromWindow(toRemove);
|
||||
}
|
||||
return allReads;
|
||||
}
|
||||
|
|
@ -623,26 +626,27 @@ public class SlidingWindow {
|
|||
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
|
||||
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
|
||||
|
||||
return result; // finalized reads will be downsampled if necessary
|
||||
return result; // finalized reads will be downsampled if necessary
|
||||
}
|
||||
|
||||
|
||||
private List<GATKSAMRecord> closeVariantRegions(List<Pair<Integer, Integer>> regions, boolean forceClose) {
|
||||
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
|
||||
public Set<GATKSAMRecord> closeVariantRegions(CompressionStash regions) {
|
||||
TreeSet<GATKSAMRecord> allReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
if (!regions.isEmpty()) {
|
||||
int lastStop = -1;
|
||||
for (Pair<Integer, Integer> region : regions) {
|
||||
int start = region.getFirst();
|
||||
int stop = region.getSecond();
|
||||
if (stop < 0 && forceClose)
|
||||
stop = windowHeader.size() - 1;
|
||||
if (stop >= 0) {
|
||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
|
||||
int windowHeaderStart = getStartLocation(windowHeader);
|
||||
|
||||
for (SimpleGenomeLoc region : regions) {
|
||||
if (region.isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() <= windowHeaderStart + windowHeader.size()) {
|
||||
int start = region.getStart() - windowHeaderStart;
|
||||
int stop = region.getStop() - windowHeaderStart;
|
||||
|
||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track
|
||||
lastStop = stop;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
|
||||
windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
|
||||
|
||||
for (int i = 0; i <= lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
|
||||
windowHeader.remove();
|
||||
}
|
||||
return allReads;
|
||||
}
|
||||
|
|
@ -676,23 +680,24 @@ public class SlidingWindow {
|
|||
*
|
||||
* @return All reads generated
|
||||
*/
|
||||
public List<GATKSAMRecord> close() {
|
||||
public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
|
||||
// mark variant regions
|
||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
Set<GATKSAMRecord> finalizedReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
CompressionStash regions = new CompressionStash();
|
||||
boolean forceCloseUnfinishedRegions = true;
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
markSites(getStopLocation(windowHeader) + 1);
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
|
||||
finalizedReads = closeVariantRegions(regions, true);
|
||||
regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions);
|
||||
finalizedReads = closeVariantRegions(regions);
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
|
||||
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return finalizedReads;
|
||||
return new Pair<Set<GATKSAMRecord>, CompressionStash>(finalizedReads, regions);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -797,9 +802,8 @@ public class SlidingWindow {
|
|||
hetReads.add(finalizeRunningConsensus());
|
||||
}
|
||||
|
||||
for (GATKSAMRecord read : toRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
removeReadsFromWindow(toRemove);
|
||||
|
||||
return hetReads;
|
||||
}
|
||||
|
||||
|
|
@ -916,5 +920,11 @@ public class SlidingWindow {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void removeReadsFromWindow (List<GATKSAMRecord> readsToRemove) {
|
||||
for (GATKSAMRecord read : readsToRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -240,8 +240,8 @@ public class AFCalcPerformanceTest {
|
|||
if ( a.isNonReference() ) {
|
||||
final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : "";
|
||||
logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE);
|
||||
final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : "";
|
||||
logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost);
|
||||
final String warningmePost = call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) == 0 && result.getLog10PosteriorOfAFEq0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : "";
|
||||
logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFEq0ForAllele(a) + warningmePost);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,175 +30,44 @@ import com.google.java.contract.Requires;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GenotypingEngine {
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
|
||||
private final VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
|
||||
public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine ) {
|
||||
this.DEBUG = DEBUG;
|
||||
this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
|
||||
this.annotationEngine = annotationEngine;
|
||||
noCall.add(Allele.NO_CALL);
|
||||
}
|
||||
|
||||
// WARN
|
||||
// This function is the streamlined approach, currently not being used by default
|
||||
// WARN
|
||||
// WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
|
||||
// WARN
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser ) {
|
||||
// Prepare the list of haplotype indices to genotype
|
||||
final ArrayList<Allele> allelesToGenotype = new ArrayList<Allele>();
|
||||
public List<VariantContext> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final List<Haplotype> haplotypes,
|
||||
final List<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final List<VariantContext> activeAllelesToGenotype ) {
|
||||
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) );
|
||||
}
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
|
||||
}
|
||||
}
|
||||
genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make());
|
||||
}
|
||||
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here
|
||||
|
||||
// Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF
|
||||
haplotypesToRemove.add(h);
|
||||
}
|
||||
}
|
||||
haplotypes.removeAll(haplotypesToRemove);
|
||||
|
||||
if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
|
||||
final List<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>> returnVCs = new ArrayList<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>>();
|
||||
// set up the default 1-to-1 haplotype mapping object
|
||||
final HashMap<Allele,ArrayList<Haplotype>> haplotypeMapping = new HashMap<Allele,ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.put(call.getAllele(h.getBases()), list);
|
||||
}
|
||||
returnVCs.add( new Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>(call,haplotypeMapping) );
|
||||
return returnVCs;
|
||||
}
|
||||
|
||||
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
int count = 0;
|
||||
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( DEBUG ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
}
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
|
||||
startPosKeySet.addAll(h.getEventMap().keySet());
|
||||
}
|
||||
|
||||
// Create the VC merge priority list
|
||||
final ArrayList<String> priorityList = new ArrayList<String>();
|
||||
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
|
||||
priorityList.add("HC" + iii);
|
||||
}
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
}
|
||||
}
|
||||
|
||||
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
}
|
||||
|
||||
if( DEBUG ) {
|
||||
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
}
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final int myNumHaplotypes = alleleMapper.size();
|
||||
final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < myNumHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
|
||||
}
|
||||
}
|
||||
|
||||
// using the allele mapping object translate the haplotype allele into the event allele
|
||||
final Genotype g = new GenotypeBuilder(sample)
|
||||
.alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes))
|
||||
.phased(loc != startPosKeySet.first())
|
||||
.PL(genotypeLikelihoods).make();
|
||||
myGenotypes.add(g);
|
||||
}
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(
|
||||
new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) );
|
||||
}
|
||||
}
|
||||
return returnCalls;
|
||||
}
|
||||
|
||||
// BUGBUG: Create a class to hold this complicated return type
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final ArrayList<VariantContext> activeAllelesToGenotype ) {
|
||||
|
||||
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
|
||||
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
|
||||
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
|
|
@ -207,7 +76,7 @@ public class GenotypingEngine {
|
|||
for( final Haplotype h : haplotypes ) {
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
|
||||
if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
|
||||
if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
|
||||
if( DEBUG ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
|
|
@ -217,10 +86,10 @@ public class GenotypingEngine {
|
|||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
|
||||
if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
|
||||
}
|
||||
if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
|
||||
if( in_GGA_mode ) {
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
startPosKeySet.add( compVC.getStart() );
|
||||
}
|
||||
|
|
@ -228,11 +97,11 @@ public class GenotypingEngine {
|
|||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
|
||||
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
|
||||
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
if( !in_GGA_mode ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
|
|
@ -261,7 +130,14 @@ public class GenotypingEngine {
|
|||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||
|
||||
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
|
||||
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
|
||||
final ArrayList<Allele> alleleOrdering = new ArrayList<Allele>(alleleMapper.size());
|
||||
alleleOrdering.add(refAllele);
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
alleleOrdering.add(vc.getAlternateAllele(0));
|
||||
}
|
||||
|
||||
// Sanity check the priority list
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
|
|
@ -283,23 +159,29 @@ public class GenotypingEngine {
|
|||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
// let's update the Allele keys in the mapper because they can change after merging when there are complex events
|
||||
final Map<Allele, List<Haplotype>> updatedAlleleMapper = new HashMap<Allele, List<Haplotype>>(alleleMapper.size());
|
||||
for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) {
|
||||
final Allele oldAllele = alleleOrdering.get(i);
|
||||
final Allele newAllele = mergedVC.getAlleles().get(i);
|
||||
updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele));
|
||||
alleleOrdering.set(i, newAllele);
|
||||
}
|
||||
alleleMapper = updatedAlleleMapper;
|
||||
|
||||
if( DEBUG ) {
|
||||
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
}
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final GenotypesContext genotypes = GenotypesContext.create(samples.size());
|
||||
for( final String sample : samples ) {
|
||||
final int numHaplotypes = alleleMapper.size();
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, alleleOrdering);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
|
|
@ -308,28 +190,58 @@ public class GenotypingEngine {
|
|||
}
|
||||
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
}
|
||||
VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
if( call != null ) {
|
||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
|
||||
// also, need to update the allele -> haplotype mapping
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
|
||||
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
|
||||
}
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap, perSampleFilteredReadList, call );
|
||||
VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call);
|
||||
|
||||
call = vcCallTrim;
|
||||
alleleHashMap = alleleHashMapTrim;
|
||||
if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
annotatedCall = VariantContextUtils.reverseTrimAlleles(annotatedCall);
|
||||
}
|
||||
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
|
||||
returnCalls.add( annotatedCall );
|
||||
}
|
||||
}
|
||||
}
|
||||
return returnCalls;
|
||||
}
|
||||
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes ) {
|
||||
private static Map<String, PerReadAlleleLikelihoodMap> filterToOnlyOverlappingReads( final GenomeLocParser parser,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> perSampleReadMap,
|
||||
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final VariantContext call ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call);
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perSampleReadMap.entrySet() ) {
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
|
||||
for( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { // BUGBUG: This uses alignment start and stop, NOT soft start and soft end...
|
||||
for( final Map.Entry<Allele,Double> alleleDoubleEntry : mapEntry.getValue().entrySet() ) {
|
||||
likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
|
||||
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
for( final Allele allele : call.getAlleles() ) {
|
||||
likelihoodMap.add(read, allele, 0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
returnMap.put(sample.getKey(), likelihoodMap);
|
||||
}
|
||||
return returnMap;
|
||||
}
|
||||
|
||||
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().values() ) {
|
||||
|
|
@ -348,7 +260,41 @@ public class GenotypingEngine {
|
|||
haplotypes.removeAll(haplotypesToRemove);
|
||||
}
|
||||
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
|
||||
// BUGBUG: ugh, too complicated
|
||||
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final Map<Allele, List<Haplotype>> alleleMapper,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
for( final Map.Entry<Allele, List<Haplotype>> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
|
||||
final List<Haplotype> mappedHaplotypes = alleleMapperEntry.getValue();
|
||||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read
|
||||
double maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
for( final Map.Entry<Allele,Double> alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele
|
||||
if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey().getBases())) ) { // exact match of haplotype base string
|
||||
maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() );
|
||||
}
|
||||
}
|
||||
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
|
||||
}
|
||||
}
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
|
||||
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
return alleleReadMap;
|
||||
}
|
||||
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes,
|
||||
final List<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final TreeSet<Integer> startPosKeySet,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc ) {
|
||||
|
||||
final int MAX_SIZE_TO_COMBINE = 15;
|
||||
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
|
||||
if( startPosKeySet.size() <= 1 ) { return; }
|
||||
|
|
@ -392,10 +338,13 @@ public class GenotypingEngine {
|
|||
}
|
||||
}
|
||||
// count up the co-occurrences of the events for the R^2 calculation
|
||||
final ArrayList<Haplotype> haplotypeList = new ArrayList<Haplotype>();
|
||||
haplotypeList.add(h);
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) {
|
||||
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0];
|
||||
for( final String sample : samples ) {
|
||||
final HashSet<String> sampleSet = new HashSet<String>(1);
|
||||
sampleSet.add(sample);
|
||||
|
||||
final List<Allele> alleleList = new ArrayList<Allele>();
|
||||
alleleList.add(Allele.create(h.getBases()));
|
||||
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeReadMap, alleleList )[0][0];
|
||||
if( thisHapVC == null ) {
|
||||
if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
|
||||
else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }
|
||||
|
|
@ -489,37 +438,87 @@ public class GenotypingEngine {
|
|||
|
||||
@Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"})
|
||||
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
|
||||
protected static ArrayList<ArrayList<Haplotype>> createAlleleMapper( final int loc, final ArrayList<VariantContext> eventsAtThisLoc, final ArrayList<Haplotype> haplotypes ) {
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
|
||||
final ArrayList<Haplotype> refList = new ArrayList<Haplotype>();
|
||||
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) {
|
||||
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = new HashMap<Allele, List<Haplotype>>(eventsAtThisLoc.size()+1);
|
||||
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
|
||||
alleleMapper.put(refAllele, new ArrayList<Haplotype>());
|
||||
for( final VariantContext vc : eventsAtThisLoc )
|
||||
alleleMapper.put(vc.getAlternateAllele(0), new ArrayList<Haplotype>());
|
||||
|
||||
final ArrayList<Haplotype> undeterminedHaplotypes = new ArrayList<Haplotype>(haplotypes.size());
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype
|
||||
refList.add(h);
|
||||
if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) {
|
||||
alleleMapper.get(h.getArtificialAllele()).add(h);
|
||||
} else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later
|
||||
undeterminedHaplotypes.add(h);
|
||||
} else {
|
||||
boolean foundInEventList = false;
|
||||
boolean haplotypeIsDetermined = false;
|
||||
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
|
||||
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
|
||||
foundInEventList = true;
|
||||
alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h);
|
||||
haplotypeIsDetermined = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype
|
||||
refList.add(h);
|
||||
}
|
||||
|
||||
if( !haplotypeIsDetermined )
|
||||
undeterminedHaplotypes.add(h);
|
||||
}
|
||||
}
|
||||
alleleMapper.add(refList);
|
||||
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
|
||||
list.add(h);
|
||||
|
||||
for( final Haplotype h : undeterminedHaplotypes ) {
|
||||
Allele matchingAllele = null;
|
||||
for( final Map.Entry<Allele, List<Haplotype>> alleleToTest : alleleMapper.entrySet() ) {
|
||||
// don't test against the reference allele
|
||||
if( alleleToTest.getKey().equals(refAllele) )
|
||||
continue;
|
||||
|
||||
final Haplotype artificialHaplotype = alleleToTest.getValue().get(0);
|
||||
if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
|
||||
matchingAllele = alleleToTest.getKey();
|
||||
break;
|
||||
}
|
||||
}
|
||||
alleleMapper.add(list);
|
||||
|
||||
if( matchingAllele == null )
|
||||
matchingAllele = refAllele;
|
||||
alleleMapper.get(matchingAllele).add(h);
|
||||
}
|
||||
|
||||
return alleleMapper;
|
||||
}
|
||||
|
||||
protected static boolean isSubSetOf(final Map<Integer, VariantContext> subset, final Map<Integer, VariantContext> superset, final boolean resolveSupersetToSubset) {
|
||||
|
||||
for ( final Map.Entry<Integer, VariantContext> fromSubset : subset.entrySet() ) {
|
||||
final VariantContext fromSuperset = superset.get(fromSubset.getKey());
|
||||
if ( fromSuperset == null )
|
||||
return false;
|
||||
|
||||
List<Allele> supersetAlleles = fromSuperset.getAlternateAlleles();
|
||||
if ( resolveSupersetToSubset )
|
||||
supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles);
|
||||
|
||||
if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static List<Allele> resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List<Allele> currentAlleles) {
|
||||
if ( targetReference.length() <= actualReference.length() )
|
||||
return currentAlleles;
|
||||
|
||||
final List<Allele> newAlleles = new ArrayList<Allele>(currentAlleles.size());
|
||||
final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length());
|
||||
for ( final Allele a : currentAlleles ) {
|
||||
newAlleles.add(Allele.extend(a, extraBases));
|
||||
}
|
||||
return newAlleles;
|
||||
}
|
||||
|
||||
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
|
||||
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final ArrayList<ArrayList<Haplotype>> alleleMapper, final ArrayList<Haplotype> haplotypes ) {
|
||||
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -41,8 +40,12 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
|
|
@ -129,14 +132,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
|
||||
protected int MIN_PRUNE_FACTOR = 1;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
|
||||
protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
|
||||
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
|
||||
protected int gcpHMM = 10;
|
||||
|
|
@ -208,11 +203,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
// the genotyping engine
|
||||
private GenotypingEngine genotypingEngine = null;
|
||||
|
||||
// the annotation engine
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
private IndexedFastaSequenceFile referenceReader;
|
||||
private CachingIndexedFastaSequenceFile referenceReader;
|
||||
|
||||
// reference base padding size
|
||||
private static final int REFERENCE_PADDING = 900;
|
||||
|
|
@ -246,15 +238,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
|
||||
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.CONTAMINATION_FRACTION = 0.0;
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// initialize the output VCF header
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
|
||||
|
|
@ -271,15 +264,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
VCFConstants.GENOTYPE_QUALITY_KEY,
|
||||
VCFConstants.DEPTH_KEY,
|
||||
VCFConstants.GENOTYPE_PL_KEY);
|
||||
// header lines for the experimental HaplotypeCaller-specific annotations
|
||||
headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant"));
|
||||
|
||||
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
|
||||
// where the filters are used. For example, in emitting all sites the lowQual field is used
|
||||
|
|
@ -296,7 +280,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine );
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -309,9 +293,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable non primary reads in the active region
|
||||
// enable non primary and extended reads in the active region
|
||||
@Override
|
||||
public boolean wantsNonPrimaryReads() { return true; }
|
||||
public EnumSet<ActiveRegionReadState> desiredReadStates() {
|
||||
return EnumSet.of(
|
||||
ActiveRegionReadState.PRIMARY,
|
||||
ActiveRegionReadState.NONPRIMARY,
|
||||
ActiveRegionReadState.EXTENDED
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
@Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"})
|
||||
|
|
@ -324,15 +314,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
}
|
||||
}
|
||||
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
|
||||
return new ActivityProfileResult(1.0);
|
||||
return new ActivityProfileResult(ref.getLocus(), 1.0);
|
||||
}
|
||||
}
|
||||
|
||||
if( USE_ALLELES_TRIGGER ) {
|
||||
return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
|
||||
return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
|
||||
}
|
||||
|
||||
if( context == null ) { return new ActivityProfileResult(0.0); }
|
||||
if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); }
|
||||
|
||||
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
|
@ -369,7 +359,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
|
||||
|
||||
return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
|
||||
return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -412,60 +402,23 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) );
|
||||
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes );
|
||||
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap ) : haplotypes );
|
||||
|
||||
for( final Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>> callResult :
|
||||
( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() )
|
||||
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
|
||||
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
|
||||
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
|
||||
|
||||
if( !GENOTYPE_FULL_ACTIVE_REGION ) {
|
||||
// add some custom annotations to the calls
|
||||
|
||||
// Calculate the number of variants on the haplotype
|
||||
int maxNumVar = 0;
|
||||
for( final Allele allele : callResult.getFirst().getAlleles() ) {
|
||||
if( !allele.isReference() ) {
|
||||
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
|
||||
final int numVar = haplotype.getEventMap().size();
|
||||
if( numVar > maxNumVar ) { maxNumVar = numVar; }
|
||||
}
|
||||
}
|
||||
}
|
||||
// Calculate the event length
|
||||
int maxLength = 0;
|
||||
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
|
||||
final int length = a.length() - annotatedCall.getReference().length();
|
||||
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
|
||||
}
|
||||
|
||||
myAttributes.put("NVH", maxNumVar);
|
||||
myAttributes.put("NumHapEval", bestHaplotypes.size());
|
||||
myAttributes.put("NumHapAssembly", haplotypes.size());
|
||||
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
|
||||
myAttributes.put("EVENTLENGTH", maxLength);
|
||||
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
|
||||
myAttributes.put("extType", annotatedCall.getType().toString() );
|
||||
|
||||
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
|
||||
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
|
||||
//}
|
||||
if( annotatedCall.hasAttribute("QD") ) {
|
||||
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
|
||||
}
|
||||
}
|
||||
|
||||
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
|
||||
for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine,
|
||||
bestHaplotypes,
|
||||
samplesList,
|
||||
stratifiedReadMap,
|
||||
perSampleFilteredReadList,
|
||||
fullReferenceWithPadding,
|
||||
getPaddedLoc(activeRegion),
|
||||
activeRegion.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
activeAllelesToGenotype ) ) {
|
||||
vcfWriter.add( call );
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
|
||||
|
|
@ -520,6 +473,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
// protect against INTERVALS with abnormally high coverage
|
||||
// BUGBUG: remove when positional downsampler is hooked up to ART/HC
|
||||
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
|
||||
activeRegion.add(clippedRead);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -71,8 +71,9 @@ public class LikelihoodCalculationEngine {
|
|||
DEBUG = debug;
|
||||
}
|
||||
|
||||
public void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
int X_METRIC_LENGTH = 0;
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
for( final GATKSAMRecord read : sample.getValue() ) {
|
||||
|
|
@ -97,20 +98,16 @@ public class LikelihoodCalculationEngine {
|
|||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
|
||||
stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue(), sampleEntry.getKey()));
|
||||
}
|
||||
return stratifiedReadMap;
|
||||
}
|
||||
|
||||
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
|
||||
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
|
||||
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final int numReads = reads.size();
|
||||
final double[][] readLikelihoods = new double[numHaplotypes][numReads];
|
||||
final int[][] readCounts = new int[numHaplotypes][numReads];
|
||||
for( int iii = 0; iii < numReads; iii++ ) {
|
||||
final GATKSAMRecord read = reads.get(iii);
|
||||
final int readCount = ReadUtils.getMeanRepresentativeReadCount(read);
|
||||
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
Haplotype previousHaplotypeSeen = null;
|
||||
|
|
@ -129,14 +126,12 @@ public class LikelihoodCalculationEngine {
|
|||
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
|
||||
previousHaplotypeSeen = haplotype;
|
||||
|
||||
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
|
||||
readCounts[jjj][iii] = readCount;
|
||||
perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()),
|
||||
pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0));
|
||||
}
|
||||
}
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] );
|
||||
}
|
||||
return perReadAlleleLikelihoodMap;
|
||||
}
|
||||
|
||||
private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
|
||||
|
|
@ -148,64 +143,48 @@ public class LikelihoodCalculationEngine {
|
|||
return Math.min(b1.length, b2.length);
|
||||
}
|
||||
|
||||
/**
 * Convenience overload for a single sample and a flat list of haplotypes.
 *
 * Wraps every haplotype in its own single-element list (a 1-to-1 haplotype mapping) and
 * delegates to computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping ).
 *
 * @param haplotypes the haplotypes to evaluate; the contract below requires a non-empty list
 * @param sample     the sample name, passed through unchanged to the delegate
 * @return the matrix produced by the delegate; the contract below states it is square with
 *         one row per input haplotype
 */
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final String sample ) {
|
||||
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.add(list);
|
||||
}
|
||||
return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping );
|
||||
}
|
||||
|
||||
// This function takes just a single sample and a haplotypeMapping
|
||||
@Requires({"haplotypeMapping.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
|
||||
@Requires({"alleleOrdering.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering ) {
|
||||
final TreeSet<String> sampleSet = new TreeSet<String>();
|
||||
sampleSet.add(sample);
|
||||
return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping);
|
||||
return computeDiploidHaplotypeLikelihoods(sampleSet, stratifiedReadMap, alleleOrdering);
|
||||
}
|
||||
|
||||
// This function takes a set of samples to pool over and a haplotypeMapping
|
||||
@Requires({"haplotypeMapping.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
|
||||
@Requires({"alleleOrdering.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering ) {
|
||||
|
||||
final int numHaplotypes = haplotypeMapping.size();
|
||||
final int numHaplotypes = alleleOrdering.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
// todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
|
||||
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
|
||||
final int[] readCounts_iii = iii_mapped.getReadCounts(sample);
|
||||
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
|
||||
final Allele iii_allele = alleleOrdering.get(iii);
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
final Allele jjj_allele = alleleOrdering.get(jjj);
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
}
|
||||
|
||||
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
|
||||
|
|
@ -290,20 +269,16 @@ public class LikelihoodCalculationEngine {
|
|||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
|
||||
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap ) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final Set<String> sampleKeySet = stratifiedReadMap.keySet();
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
|
||||
// set up the default 1-to-1 haplotype mapping object
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.add(list);
|
||||
}
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together
|
||||
final List<Allele> haplotypesAsAlleles = new ArrayList<Allele>();
|
||||
for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h.getBases())); }
|
||||
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles ); // all samples pooled together
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
|
|
@ -343,52 +318,4 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
}
|
||||
|
||||
/**
 * Builds, for every sample in perSampleReadList, a PerReadAlleleLikelihoodMap containing only
 * the reads that overlap the location of the called variant.
 *
 * For each overlapping read and each allele of the call, the stored likelihood is the maximum
 * of that read's likelihoods over all haplotypes listed for the allele in call.getSecond().
 * The per-sample map is then passed through performPerAlleleDownsampling, and afterwards every
 * overlapping read from perSampleFilteredReadList is added with likelihood 0.0 for every allele.
 *
 * NOTE(review): a read is matched to its haplotype likelihood purely by its index in the
 * per-sample list (see the BUGBUG below), so list order must agree with the likelihood arrays.
 * NOTE(review): perSampleFilteredReadList.get(...) is iterated without a null check; this
 * presumably relies on both maps having the same sample keys -- confirm against callers.
 *
 * @param parser                    used to create GenomeLocs for the call and for each read
 * @param perSampleReadList         sample name -> reads that carry haplotype likelihoods
 * @param perSampleFilteredReadList sample name -> filtered reads (added with likelihood 0.0)
 * @param call                      the called VariantContext paired with its allele -> haplotypes map
 * @param downsamplingFraction      forwarded to performPerAlleleDownsampling
 * @param downsamplingLog           forwarded to performPerAlleleDownsampling
 * @return sample name -> likelihood map, with one entry per key of perSampleReadList
 */
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog ) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
|
||||
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
|
||||
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
|
||||
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
for( final Allele a : call.getFirst().getAlleles() ) {
|
||||
double maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
|
||||
final double likelihood = h.getReadLikelihoods(sample.getKey())[iii];
|
||||
if( likelihood > maxLikelihood ) {
|
||||
maxLikelihood = likelihood;
|
||||
}
|
||||
}
|
||||
likelihoodMap.add(read, a, maxLikelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// down-sample before adding filtered reads
|
||||
likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
|
||||
|
||||
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
|
||||
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
for( final Allele a : call.getFirst().getAlleles() ) {
|
||||
likelihoodMap.add(read, a, 0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
returnMap.put(sample.getKey(), likelihoodMap);
|
||||
|
||||
}
|
||||
return returnMap;
|
||||
}
|
||||
}
|
||||
|
|
@ -278,27 +278,34 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
|
||||
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype
|
||||
// for GGA mode, add the desired allele into the haplotype
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart());
|
||||
if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
|
||||
return returnHaplotypes;
|
||||
//throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype);
|
||||
}
|
||||
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
|
||||
addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
|
||||
}
|
||||
}
|
||||
|
||||
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
|
||||
for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
|
||||
|
||||
final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
|
||||
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
|
||||
if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ) ) {
|
||||
|
||||
// for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
if( !activeAllelesToGenotype.isEmpty() ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
|
||||
if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {
|
||||
|
||||
// This if statement used to additionally have:
|
||||
// "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
|
||||
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
|
||||
// a haplotype that already contains a 1bp insertion (so practically it is reference but
|
||||
// falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
|
||||
if( vcOnHaplotype == null ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
|
||||
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -322,7 +329,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
return returnHaplotypes;
|
||||
}
|
||||
|
||||
private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop ) {
|
||||
private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
|
||||
if( haplotype == null ) { return false; }
|
||||
|
||||
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
|
|
@ -369,13 +376,15 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
|
||||
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) );
|
||||
if ( haplotype.isArtificialHaplotype() )
|
||||
h.setArtificialAllele(haplotype.getArtificialAllele(), haplotype.getArtificialAllelePosition());
|
||||
h.leftBreakPoint = leftBreakPoint;
|
||||
h.rightBreakPoint = rightBreakPoint;
|
||||
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures
|
||||
return false;
|
||||
}
|
||||
|
||||
if( !haplotypeList.contains(h) ) {
|
||||
if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
|
||||
haplotypeList.add(h);
|
||||
return true;
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
||||
/**
|
||||
* Basic unit test for AlleleBiasedDownsamplingUtils
|
||||
*/
|
||||
public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testSmartDownsampling() {
|
||||
|
||||
final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50};
|
||||
final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0};
|
||||
|
||||
// no contamination, no removal
|
||||
testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
|
||||
// hom sample, het contaminant, different alleles
|
||||
testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
|
||||
// hom sample, hom contaminant, different alleles
|
||||
testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
|
||||
// het sample, het contaminant, different alleles
|
||||
testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
|
||||
// het sample, hom contaminant, different alleles
|
||||
testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
|
||||
// hom sample, het contaminant, overlapping alleles
|
||||
final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0};
|
||||
testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
|
||||
testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
|
||||
testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
|
||||
|
||||
// hom sample, hom contaminant, overlapping alleles
|
||||
testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0});
|
||||
|
||||
// het sample, het contaminant, overlapping alleles
|
||||
testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55});
|
||||
testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
|
||||
// het sample, hom contaminant, overlapping alleles
|
||||
testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
}
|
||||
|
||||
private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction,
|
||||
final int pileupSize, final int[] initialCounts, final int[] targetCounts) {
|
||||
|
||||
final int[] actualCounts = initialCounts.clone();
|
||||
actualCounts[0] += addA;
|
||||
actualCounts[1] += addC;
|
||||
actualCounts[2] += addG;
|
||||
actualCounts[3] += addT;
|
||||
|
||||
final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction));
|
||||
Assert.assertTrue(countsAreEqual(results, targetCounts));
|
||||
}
|
||||
|
||||
private static boolean countsAreEqual(final int[] counts1, final int[] counts2) {
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
if ( counts1[i] != counts2[i] )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -37,6 +37,7 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
" -L " + interval +
|
||||
args +
|
||||
" -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) +
|
||||
" --allow_potentially_misencoded_quality_scores" + // TODO -- remove me when we get new SOLiD bams
|
||||
" -o %s";
|
||||
}
|
||||
|
||||
|
|
@ -112,6 +113,7 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
" -R " + b36KGReference +
|
||||
" -I " + privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam" +
|
||||
" -L 1:50,000-80,000" +
|
||||
" --allow_potentially_misencoded_quality_scores" + // TODO -- remove me when we get new SOLiD bams
|
||||
" -o %s",
|
||||
1, // just one output file
|
||||
UserException.class);
|
||||
|
|
|
|||
|
|
@ -14,6 +14,10 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
final String DIVIDEBYZERO_BAM = validationDataLocation + "ReduceReadsDivideByZeroBug.bam";
|
||||
final String DIVIDEBYZERO_L = " -L " + validationDataLocation + "ReduceReadsDivideByZeroBug.intervals";
|
||||
final String L = " -L 20:10,100,000-10,120,000 ";
|
||||
final String COREDUCTION_BAM_A = validationDataLocation + "coreduction.test.A.bam";
|
||||
final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam";
|
||||
final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057";
|
||||
final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam";
|
||||
|
||||
private void RRTest(String testName, String args, String md5) {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s ";
|
||||
|
|
@ -21,36 +25,36 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testDefaultCompression() {
|
||||
RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f");
|
||||
RRTest("testDefaultCompression ", L, "98080d3c53f441564796fc143cf510da");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervals() {
|
||||
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||
RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76");
|
||||
RRTest("testMultipleIntervals ", intervals, "c5dcdf4edf368b5b897d66f76034d9f0");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testHighCompression() {
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047");
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "27cb99e87eda5e46187e56f50dd37f26");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testLowCompression() {
|
||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63");
|
||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "4e7f111688d49973c35669855b7a2eaf");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testIndelCompression() {
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973");
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f6c9ea83608f35f113cf1f62a77ee6d0");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testFilteredDeletionCompression() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("122e4e60c4412a31d0aeb3cce879e841")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -61,20 +65,36 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
*
|
||||
* This bam is simplified to replicate the exact bug with the three provided intervals.
|
||||
*/
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testAddingReadAfterTailingTheStash() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6")));
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("647b0f0f95730de8e6bc4f74186ad4df")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get
|
||||
* filtered out.
|
||||
*/
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testDivideByZero() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
|
||||
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2")));
|
||||
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("2c87985972dd43ee9dd50b463d93a511")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReduction() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
|
||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("5c30fde961a1357bf72c15144c01981b")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will
|
||||
* break the current hard-clipping routine that protects reduce reads from such reads.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testReadOffContig() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s ";
|
||||
executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("2f17c1a78e9d0138217fdb83cede8f68")));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -80,11 +80,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","dd568dc30be90135a3a8957a45a7321c");
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "bf793c43b635a931207170be8035b288");
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("cdec335abc9ad8e59335e39a73e0e95a"));
|
||||
Arrays.asList("847605f4efafef89529fe0e496315edd"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -38,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testWithAllelesPassedIn1() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
|
||||
Arrays.asList("efddb5e258f97fd4f6661cff9eaa57de"));
|
||||
Arrays.asList("5b31b811072a4df04524e13604015f9b"));
|
||||
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
|
||||
}
|
||||
|
||||
|
|
@ -46,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testWithAllelesPassedIn2() {
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
|
||||
Arrays.asList("24532eb381724cd74e99370da28d49ed"));
|
||||
Arrays.asList("d9992e55381afb43742cc9b30fcd7538"));
|
||||
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("062a946160eec1d0fc135d58ca654ff4"));
|
||||
Arrays.asList("fea530fdc8677e10be4cc11625fa5376"));
|
||||
executeTest("test SingleSample Pilot2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("a373979d01c3a3fb20159235d27eb92c"));
|
||||
Arrays.asList("97df6c2a8d390d43b9bdf56c979d9b09"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReverseTrim() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
|
||||
Arrays.asList("9106d01ca0d0a8fedd068e72d509f380"));
|
||||
Arrays.asList("e14c9b1f9f34d6c16de445bfa385be89"));
|
||||
executeTest("test reverse trim", spec);
|
||||
}
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMismatchedPLs() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
|
||||
Arrays.asList("d847acf841ba8ba653f996ce4869f439"));
|
||||
Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8"));
|
||||
executeTest("test mismatched PLs", spec);
|
||||
}
|
||||
|
||||
|
|
@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "6792419c482e767a3deb28913ed2b1ad";
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "af8187e2baf516dde1cddea787a52b8a";
|
||||
|
||||
@Test
|
||||
public void testCompressedOutput() {
|
||||
|
|
@ -149,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinBaseQualityScore() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
|
||||
Arrays.asList("56157d930da6ccd224bce1ca93f11e41"));
|
||||
Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff"));
|
||||
executeTest("test min_base_quality_score 26", spec);
|
||||
}
|
||||
|
||||
|
|
@ -157,7 +157,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSLOD() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("6ccb9bd88934e4272d0ce362dd35e603"));
|
||||
Arrays.asList("55760482335497086458b09e415ecf54"));
|
||||
executeTest("test SLOD", spec);
|
||||
}
|
||||
|
||||
|
|
@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testNDA() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("480437dd6e2760f4ab3194431519f331"));
|
||||
Arrays.asList("938e888a40182878be4c3cc4859adb69"));
|
||||
executeTest("test NDA", spec);
|
||||
}
|
||||
|
||||
|
|
@ -173,11 +173,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testCompTrack() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("22c039412fd387dde6125b07c9a74a25"));
|
||||
Arrays.asList("7dc186d420487e4e156a24ec8dea0951"));
|
||||
executeTest("test using comp track", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(enabled = false) // EB: for some reason this test crashes whenever I run it on my local machine
|
||||
public void testNoCmdLineHeaderStdout() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0,
|
||||
|
|
@ -187,17 +187,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testOutputParameterSitesOnly() {
|
||||
testOutputParameters("-sites_only", "40aeb4c9e31fe7046b72afc58e7599cb");
|
||||
testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllConfident() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "c706ca93b25ff83613cb4e95dcac567c");
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllSites() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "5c75cecb523cac988beecd59186289ff");
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4");
|
||||
}
|
||||
|
||||
private void testOutputParameters(final String args, final String md5) {
|
||||
|
|
@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
|
||||
Arrays.asList("df524e98903d96ab9353bee7c16a69de"));
|
||||
Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb"));
|
||||
executeTest("test confidence 1", spec1);
|
||||
}
|
||||
|
||||
|
|
@ -222,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testHeterozyosity1() {
|
||||
testHeterozosity( 0.01, "8e61498ca03a8d805372a64c466b3b42" );
|
||||
testHeterozosity( 0.01, "8dd37249e0a80afa86594c3f1e720760" );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHeterozyosity2() {
|
||||
testHeterozosity( 1.0 / 1850, "668d06b5173cf3b97d052726988e1d7b" );
|
||||
testHeterozosity( 1.0 / 1850, "040d169e20fda56f8de009a6015eb384" );
|
||||
}
|
||||
|
||||
private void testHeterozosity(final double arg, final String md5) {
|
||||
|
|
@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("908eb5e21fa39e7fb377cf4a9c4c7835"));
|
||||
Arrays.asList("0e4713e4aa44f4f8fcfea7138295a627"));
|
||||
|
||||
executeTest(String.format("test multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -L 1:10,000,000-10,100,000" +
|
||||
" -baq CALCULATE_AS_NECESSARY",
|
||||
1,
|
||||
Arrays.asList("c814558bb0ed2e19b12e1a2bf4465d52"));
|
||||
Arrays.asList("46ea5d1ceb8eed1d0db63c3577915d6c"));
|
||||
|
||||
executeTest(String.format("test calling with BAQ"), spec);
|
||||
}
|
||||
|
|
@ -289,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("3593495aab5f6204c65de0b073a6ff65"));
|
||||
Arrays.asList("50329e15e5139be9e3b643f0b3ba8a53"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX"), spec);
|
||||
}
|
||||
|
|
@ -304,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -minIndelCnt 1" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("8b486a098029d5a106b0a37eff541c15"));
|
||||
Arrays.asList("2b85e3bd6bf981afaf7324666740d74b"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
|
||||
}
|
||||
|
|
@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("18efedc50cae2aacaba372265e38310b"));
|
||||
Arrays.asList("a6fd46eff78827060451a62cffd698a7"));
|
||||
|
||||
executeTest(String.format("test indel calling, multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("3ff8c7c80a518aa3eb8671a21479de5f"));
|
||||
Arrays.asList("b8129bf754490cc3c76191d8cc4ec93f"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
|
||||
}
|
||||
|
||||
|
|
@ -337,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
||||
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("578c0540f4f2052a634a829bcb9cc27d"));
|
||||
Arrays.asList("591332fa0b5b22778cf820ee257049d2"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
|
||||
}
|
||||
|
||||
|
|
@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSampleIndels1() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("a5a81bf1b10be860a6a5272fb928e8eb"));
|
||||
Arrays.asList("69df7a00f800204564ca3726e1871132"));
|
||||
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("ad52814cd6c45df424fc992699feead6"));
|
||||
Arrays.asList("1256a7eceff2c2374c231ff981df486d"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -407,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction0() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
|
||||
Arrays.asList("857b8e5df444463ac27f665c4f67fbe2"));
|
||||
Arrays.asList("90adefd39ed67865b0cb275ad0f07383"));
|
||||
executeTest("test minIndelFraction 0.0", spec);
|
||||
}
|
||||
|
||||
|
|
@ -415,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction25() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
|
||||
Arrays.asList("81d4c7d9010fd6733b2997bc378e7471"));
|
||||
Arrays.asList("2fded43949e258f8e9f68893c61c1bdd"));
|
||||
executeTest("test minIndelFraction 0.25", spec);
|
||||
}
|
||||
|
||||
|
|
@ -436,8 +436,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testNsInCigar() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1,
|
||||
Arrays.asList("f388d2ebb05e7269e7f0a7e9b8d2dbaa"));
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1,
|
||||
Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6"));
|
||||
executeTest("test calling on reads with Ns in CIGAR", spec);
|
||||
}
|
||||
|
||||
|
|
@ -451,18 +451,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("9a7cd58b9e3d5b72608c0d529321deba"));
|
||||
Arrays.asList("092e42a712afb660ec79ff11c55933e2"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "e7fc11baf208a1bca7b462d3148c936e");
|
||||
testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "132a4e0ccf9230b5bb4b56c649e2bdd5");
|
||||
testReducedCalling("INDEL", "3c02ee5187933bed44dc416a2e28511f");
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -483,7 +483,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testContaminationDownsampling() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1,
|
||||
Arrays.asList("27dd04159e06d9524fb8a4eef41f96ae"));
|
||||
Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb"));
|
||||
executeTest("test contamination_percentage_to_filter 0.20", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -83,8 +83,8 @@ public class AFCalcResultUnitTest extends BaseTest {
|
|||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<Double> pValues = new LinkedList<Double>();
|
||||
for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999) )
|
||||
for ( final double espilon : Arrays.asList(-1e-5, 0.0, 1e-5) )
|
||||
for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999, 1 - 1e-4, 1 - 1e-5, 1 - 1e-6) )
|
||||
for ( final double espilon : Arrays.asList(-1e-7, 0.0, 1e-7) )
|
||||
pValues.add(p + espilon);
|
||||
|
||||
for ( final double pNonRef : pValues ) {
|
||||
|
|
@ -106,16 +106,16 @@ public class AFCalcResultUnitTest extends BaseTest {
|
|||
alleles,
|
||||
MathUtils.normalizeFromLog10(new double[]{1 - pNonRef, pNonRef}, true, false),
|
||||
log10Even,
|
||||
Collections.singletonMap(C, Math.log10(pNonRef)));
|
||||
Collections.singletonMap(C, Math.log10(1 - pNonRef)));
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "TestIsPolymorphic")
|
||||
private void testIsPolymorphic(final double pNonRef, final double pThreshold, final boolean shouldBePoly) {
|
||||
final AFCalcResult result = makePolymorphicTestData(pNonRef);
|
||||
final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(pThreshold));
|
||||
Assert.assertEquals(actualIsPoly, shouldBePoly,
|
||||
"isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned "
|
||||
+ actualIsPoly + " but the expected result is " + shouldBePoly);
|
||||
final AFCalcResult result = makePolymorphicTestData(pNonRef);
|
||||
final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(1 - pThreshold));
|
||||
Assert.assertEquals(actualIsPoly, shouldBePoly,
|
||||
"isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned "
|
||||
+ actualIsPoly + " but the expected result is " + shouldBePoly);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "TestIsPolymorphic")
|
||||
|
|
|
|||
|
|
@ -681,7 +681,7 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
|
||||
// must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs
|
||||
Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt));
|
||||
Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt));
|
||||
Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFEq0ForAllele(alt));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -148,7 +148,7 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
|
|||
for ( int i = 0; i < log10LAlleles.size(); i++ ) {
|
||||
final double log10LAllele1 = log10LAlleles.get(i);
|
||||
final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true);
|
||||
final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0));
|
||||
final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0));
|
||||
originalPriors.add(result1);
|
||||
pNonRefN.add(log10pNonRef*(i+1));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -21,17 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "aa1df35d6e64d7ca93feb4d2dd15dd0e");
|
||||
HCTest(CEUTRIO_BAM, "", "d602d40852ad6d2d094be07e60cf95bd");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "186c7f322978283c01249c6de2829215");
|
||||
HCTest(NA12878_BAM, "", "70ad9d53dda4d302b879ca2b7dd5b368");
|
||||
}
|
||||
|
||||
// TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "de9e78a52207fe62144dba5337965469");
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"fe84caa79f59ecbd98fcbcd5b30ab164");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
|
|
@ -42,18 +44,18 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "000dbb1b48f94d017cfec127c6cabe8f");
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "883871f8bb4099f69fd804f8a6181954");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 2";
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleSymbolic() {
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a");
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "338ab3b7dc3d54df8af94c0811028a75");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
|
|
@ -64,20 +66,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "b369c2a6cb5c99a424551b33bae16f3b");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "aff11b014ca42bfa301bcced5f5e54dd");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c306140ad28515ee06c603c225217939"));
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("2f4ed6dc969bee041215944a9b24328f"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6c67ee8e99cc8f53a6587bb26028047"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("d8d6f2ebe79bca81c8a0911daa153b89"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
|
|
@ -91,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("4beb9f87ab3f316a9384c3d0dca6ebe9"));
|
||||
Arrays.asList("d01cb5f77ed5aca1d228cfbce9364c21"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
|
@ -52,6 +51,8 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
|
|||
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2));
|
||||
}
|
||||
|
||||
// BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests!
|
||||
/*
|
||||
private class BasicLikelihoodTestProvider extends TestDataProvider {
|
||||
public Double readLikelihoodForHaplotype1;
|
||||
public Double readLikelihoodForHaplotype2;
|
||||
|
|
@ -102,7 +103,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
|
|||
haplotypes.add(haplotype);
|
||||
}
|
||||
}
|
||||
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample");
|
||||
final HashSet<String> sampleSet = new HashSet<String>(1);
|
||||
sampleSet.add("myTestSample");
|
||||
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -151,10 +154,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
|
|||
logger.warn(String.format("Test: %s", cfg.toString()));
|
||||
Assert.assertTrue(compareDoubleArrays(calculatedMatrix, expectedMatrix));
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Private function to compare 2d arrays
|
||||
*/
|
||||
//Private function to compare 2d arrays
|
||||
private boolean compareDoubleArrays(double[][] b1, double[][] b2) {
|
||||
if( b1.length != b2.length ) {
|
||||
return false; // sanity check
|
||||
|
|
|
|||
|
|
@ -445,13 +445,17 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
/**
 * Resolves the downsampling method for the current walker.
 *
 * Precedence is: the method given on the command line, then the method requested by the
 * walker, then the default method for the walker.
 *
 * NOTE(review): this span comes from a diff hunk and appears to show both the removed
 * (useExperimentalDownsampling) and the added (useLegacyDownsampler) lines together;
 * confirm which set is live before relying on it.
 *
 * @return the selected downsampling method
 */
protected DownsamplingMethod getDownsamplingMethod() {
|
||||
GATKArgumentCollection argCollection = this.getArguments();
|
||||
boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
|
||||
|
||||
// Legacy downsampler can only be selected via the command line, not via walker annotations
|
||||
boolean useLegacyDownsampler = argCollection.useLegacyDownsampler;
|
||||
|
||||
DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
|
||||
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
|
||||
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
|
||||
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useLegacyDownsampler);
|
||||
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useLegacyDownsampler);
|
||||
|
||||
return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
|
||||
DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
|
||||
// Let the chosen method validate itself against the walker before it is handed back.
method.checkCompatibilityWithWalker(walker);
|
||||
return method;
|
||||
}
|
||||
|
||||
protected void setDownsamplingMethod(DownsamplingMethod method) {
|
||||
|
|
@ -580,9 +584,9 @@ public class GenomeAnalysisEngine {
|
|||
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
||||
}
|
||||
|
||||
// Use the experimental ReadShardBalancer if experimental downsampling is enabled
|
||||
ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ?
|
||||
new ExperimentalReadShardBalancer() :
|
||||
// Use the legacy ReadShardBalancer if legacy downsampling is enabled
|
||||
ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ?
|
||||
new LegacyReadShardBalancer() :
|
||||
new ReadShardBalancer();
|
||||
|
||||
if(intervals == null)
|
||||
|
|
|
|||
|
|
@ -305,11 +305,23 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
* Gets the type of downsampling method requested by the walker. If an alternative
|
||||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walkerClass The class of the walker to interrogate.
|
||||
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
|
||||
* @param walker The walker to interrogate.
|
||||
* @param useLegacyDownsampler If true, use the legacy downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useExperimentalDownsampling) {
|
||||
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useLegacyDownsampler) {
|
||||
// Convenience overload: delegate to the Class-based variant using the walker's runtime class.
return getDownsamplingMethod(walker.getClass(), useLegacyDownsampler);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the type of downsampling method requested by the walker. If an alternative
|
||||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walkerClass The class of the walker to interrogate.
|
||||
* @param useLegacyDownsampler If true, use the legacy downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useLegacyDownsampler) {
|
||||
DownsamplingMethod downsamplingMethod = null;
|
||||
|
||||
if( walkerClass.isAnnotationPresent(Downsample.class) ) {
|
||||
|
|
@ -317,7 +329,7 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
DownsampleType type = downsampleParameters.by();
|
||||
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
|
||||
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
|
||||
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
|
||||
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useLegacyDownsampler);
|
||||
}
|
||||
|
||||
return downsamplingMethod;
|
||||
|
|
@ -331,18 +343,6 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the type of downsampling method requested by the walker. If an alternative
|
||||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walker The walker to interrogate.
|
||||
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
|
||||
// Convenience overload: delegate to the Class-based variant using the walker's runtime class.
return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a name for this type of walker.
|
||||
*
|
||||
|
|
@ -350,11 +350,11 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
* @return A name for this type of walker.
|
||||
*/
|
||||
@Override
|
||||
public String getName(Class<? extends Walker> walkerType) {
|
||||
public String getName(Class walkerType) {
|
||||
String walkerName = "";
|
||||
|
||||
if (walkerType.getAnnotation(WalkerName.class) != null)
|
||||
walkerName = walkerType.getAnnotation(WalkerName.class).value().trim();
|
||||
walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim();
|
||||
else
|
||||
walkerName = super.getName(walkerType);
|
||||
|
||||
|
|
|
|||
|
|
@ -162,12 +162,11 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
|
||||
public Double downsampleFraction = null;
|
||||
|
||||
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
|
||||
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.", required = false)
|
||||
public Integer downsampleCoverage = null;
|
||||
|
||||
@Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
|
||||
@Hidden
|
||||
public boolean enableExperimentalDownsampling = false;
|
||||
@Argument(fullName = "use_legacy_downsampler", shortName = "use_legacy_downsampler", doc = "Use the legacy downsampling implementation instead of the newer, less-tested implementation", required = false)
|
||||
public boolean useLegacyDownsampler = false;
|
||||
|
||||
/**
|
||||
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
|
||||
|
|
@ -178,7 +177,7 @@ public class GATKArgumentCollection {
|
|||
if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
|
||||
return null;
|
||||
|
||||
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
|
||||
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, useLegacyDownsampler);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -192,7 +191,7 @@ public class GATKArgumentCollection {
|
|||
downsamplingType = method.type;
|
||||
downsampleCoverage = method.toCoverage;
|
||||
downsampleFraction = method.toFraction;
|
||||
enableExperimentalDownsampling = method.useExperimentalDownsampling;
|
||||
useLegacyDownsampler = method.useLegacyDownsampler;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -206,6 +205,22 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
|
||||
public double BAQGOP = BAQ.DEFAULT_GOP;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// quality encoding checking arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at ASCII 64. The idea here is
|
||||
* simple: we just iterate over all reads and subtract 31 from every quality score.
|
||||
*/
|
||||
@Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false)
|
||||
public boolean FIX_MISENCODED_QUALS = false;
|
||||
|
||||
@Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file", required = false)
|
||||
public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// performance log arguments
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.contexts;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -39,10 +38,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
|
||||
public class ReferenceContext {
|
||||
final public static boolean UPPERCASE_REFERENCE = true;
|
||||
|
||||
/**
|
||||
* Facilitates creation of new GenomeLocs.
|
||||
*/
|
||||
|
|
@ -59,7 +55,8 @@ public class ReferenceContext {
|
|||
final private GenomeLoc window;
|
||||
|
||||
/**
|
||||
* The bases in the window around the current locus. If null, then bases haven't been fetched yet
|
||||
* The bases in the window around the current locus. If null, then bases haven't been fetched yet.
|
||||
* Bases are always upper cased
|
||||
*/
|
||||
private byte[] basesCache = null;
|
||||
|
||||
|
|
@ -81,7 +78,7 @@ public class ReferenceContext {
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
@Ensures({"result != null"})
|
||||
public byte[] getBases();
|
||||
}
|
||||
|
||||
|
|
@ -146,7 +143,9 @@ public class ReferenceContext {
|
|||
/**
 * Fills basesCache from basesProvider if it has not been filled yet; does nothing when the
 * cache is already non-null.
 */
private void fetchBasesFromProvider() {
|
||||
if ( basesCache == null ) {
|
||||
basesCache = basesProvider.getBases();
|
||||
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache);
|
||||
|
||||
// must be an assertion that only runs when the bases are fetched, so that it runs in a reasonable amount of time
|
||||
assert BaseUtils.isUpperCase(basesCache);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -194,6 +193,7 @@ public class ReferenceContext {
|
|||
/**
|
||||
* All the bases in the window from the current base forward to the end of the window.
|
||||
*/
|
||||
@Ensures({"result != null", "result.length > 0"})
|
||||
public byte[] getForwardBases() {
|
||||
final byte[] bases = getBases();
|
||||
final int mid = locus.getStart() - window.getStart();
|
||||
|
|
|
|||
|
|
@ -136,11 +136,12 @@ public abstract class LocusView extends LocusIterator implements View {
|
|||
// Cache the current and apply filtering.
|
||||
AlignmentContext current = nextLocus;
|
||||
|
||||
// The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
|
||||
if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
|
||||
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
|
||||
// The old ALL_READS downsampling implementation -- use only if legacy downsampling was requested:
|
||||
if ( sourceInfo.getDownsamplingMethod().useLegacyDownsampler &&
|
||||
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS &&
|
||||
sourceInfo.getDownsamplingMethod().toCoverage != null ) {
|
||||
|
||||
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
|
||||
current.downsampleToCoverage(sourceInfo.getDownsamplingMethod().toCoverage);
|
||||
}
|
||||
|
||||
// Indicate that the next operation will need to advance.
|
||||
|
|
|
|||
|
|
@ -134,12 +134,11 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
|
||||
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
|
||||
// TODO: clean this up once the experimental downsampling engine fork collapses
|
||||
if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) {
|
||||
currentPosition = dataSource.getInitialReaderPositions();
|
||||
if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useLegacyDownsampler ) {
|
||||
currentPosition = dataSource.getCurrentPosition();
|
||||
}
|
||||
else {
|
||||
currentPosition = dataSource.getCurrentPosition();
|
||||
|
||||
currentPosition = dataSource.getInitialReaderPositions();
|
||||
}
|
||||
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
|
|
|
|||
|
|
@ -1,228 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards.
|
||||
*
|
||||
* When processing FilePointers, our strategy is to aggregate all FilePointers for each contig
|
||||
* together into one monolithic FilePointer, create one persistent set of read iterators over
|
||||
* that monolithic FilePointer, and repeatedly use that persistent set of read iterators to
|
||||
* fill read shards with reads.
|
||||
*
|
||||
* This strategy has several important advantages:
|
||||
*
|
||||
* 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole
|
||||
* contig will have regions that overlap with other FilePointers on the same contig, due
|
||||
* to the limited granularity of BAM index data. By creating only one FilePointer per contig,
|
||||
* we avoid having to track how much of each file region we've visited (as we did in the
|
||||
* former implementation), we avoid expensive non-sequential access patterns in the files,
|
||||
* and we avoid having to repeatedly re-create our iterator chain for every small region
|
||||
* of interest.
|
||||
*
|
||||
* 2. We avoid boundary issues with the engine-level downsampling. Since we create a single
|
||||
* persistent set of read iterators (which include the downsampling iterator(s)) per contig,
|
||||
* the downsampling process is never interrupted by FilePointer or Shard boundaries, and never
|
||||
* loses crucial state information while downsampling within a contig.
|
||||
*
|
||||
* TODO: There is also at least one important disadvantage:
|
||||
*
|
||||
* 1. We load more BAM index data into memory at once, and this work is done upfront before processing
|
||||
* the next contig, creating a delay before traversal of each contig. This delay may be
|
||||
* compensated for by the gains listed in #1 above, and we may be no worse off overall in
|
||||
* terms of total runtime, but we need to verify this empirically.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ExperimentalReadShardBalancer extends ShardBalancer {
|
||||
|
||||
private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class);
|
||||
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentContigFilePointer = null;
|
||||
|
||||
/**
|
||||
* Iterator over the reads from the current contig's file pointer. The same iterator will be
|
||||
* used to fill all shards associated with a given file pointer
|
||||
*/
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
if ( ! hasNext() )
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
nextShard = null;
|
||||
|
||||
// May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away
|
||||
while ( nextShard == null && currentContigFilePointer != null ) {
|
||||
|
||||
// If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one):
|
||||
if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) {
|
||||
|
||||
// Close the old, exhausted chain of iterators to release resources
|
||||
currentContigReadsIterator.close();
|
||||
|
||||
// Advance to the FilePointer for the next contig
|
||||
createNextContigFilePointer();
|
||||
|
||||
// We'll need to create a fresh iterator for this file pointer when we create the first
|
||||
// shard for it below.
|
||||
currentContigReadsIterator = null;
|
||||
}
|
||||
|
||||
// At this point our currentContigReadsIterator may be null or non-null depending on whether or not
|
||||
// this is our first shard for this file pointer.
|
||||
if ( currentContigFilePointer != null ) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped);
|
||||
|
||||
// Create a new reads iterator only when we've just advanced to the file pointer for the next
|
||||
// contig. It's essential that the iterators persist across all shards that share the same contig
|
||||
// to allow the downsampling to work properly.
|
||||
if ( currentContigReadsIterator == null ) {
|
||||
currentContigReadsIterator = new PeekableIterator<SAMRecord>(readsDataSource.getIterator(shard));
|
||||
}
|
||||
|
||||
if ( currentContigReadsIterator.hasNext() ) {
|
||||
shard.fill(currentContigReadsIterator);
|
||||
nextShard = shard;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate all FilePointers for the next contig together into one monolithic FilePointer
|
||||
* to avoid boundary issues with visiting the same file regions more than once (since more
|
||||
* granular FilePointers will have regions that overlap with other nearby FilePointers due
|
||||
* to the nature of BAM indices).
|
||||
*
|
||||
* By creating one persistent set of iterators per contig we also avoid boundary artifacts
|
||||
* in the engine-level downsampling.
|
||||
*
|
||||
* TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for
|
||||
* TODO: read traversals, as there's little point in the BAMSchedule emitting extremely
|
||||
* TODO: granular FilePointers if we're just going to union them. The BAMSchedule should
|
||||
* TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for
|
||||
* TODO: locus traversals).
|
||||
*/
|
||||
private void createNextContigFilePointer() {
|
||||
currentContigFilePointer = null;
|
||||
List<FilePointer> nextContigFilePointers = new ArrayList<FilePointer>();
|
||||
|
||||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
(! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped &&
|
||||
nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) ||
|
||||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
// save it for next time
|
||||
}
|
||||
}
|
||||
|
||||
if ( ! nextContigFilePointers.isEmpty() ) {
|
||||
currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser);
|
||||
}
|
||||
|
||||
if ( currentContigFilePointer != null ) {
|
||||
logger.info("Done loading BAM index data for next contig");
|
||||
logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer));
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -282,7 +282,7 @@ public class GATKBAMIndex {
|
|||
final int nBins = readInteger();
|
||||
// System.out.println("# nBins: " + nBins);
|
||||
for (int j = 0; j < nBins; j++) {
|
||||
final int bin = readInteger();
|
||||
skipInteger();
|
||||
final int nChunks = readInteger();
|
||||
// System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks);
|
||||
skipBytes(16 * nChunks);
|
||||
|
|
@ -334,6 +334,10 @@ public class GATKBAMIndex {
|
|||
return buffer.getInt();
|
||||
}
|
||||
|
||||
/**
 * Skips one integer's worth of bytes (INT_SIZE_IN_BYTES) by delegating to skipBytes.
 */
private void skipInteger() {
|
||||
skipBytes(INT_SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads an array of <count> longs from the file channel, returning the results as an array.
|
||||
* @param count Number of longs to read.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,129 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Divide up large file pointers containing reads into more manageable subcomponents.
|
||||
*
|
||||
* TODO: delete this class once the experimental downsampling engine fork collapses
|
||||
*/
|
||||
public class LegacyReadShardBalancer extends ShardBalancer {
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentFilePointer;
|
||||
|
||||
/**
|
||||
* Ending position of the last shard in the file.
|
||||
*/
|
||||
private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();
|
||||
|
||||
{
|
||||
if(filePointers.hasNext())
|
||||
currentFilePointer = filePointers.next();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition;
|
||||
nextShard = null;
|
||||
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
|
||||
if(!isEmpty(selectedReaders)) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
readsDataSource.fillShard(shard);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
nextShard = shard;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
selectedReaders.clear();
|
||||
currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
|
||||
}
|
||||
|
||||
position = readsDataSource.getCurrentPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects whether the list of file spans contain any read data.
|
||||
* @param selectedSpans Mapping of readers to file spans.
|
||||
* @return True if file spans are completely empty; false otherwise.
|
||||
*/
|
||||
private boolean isEmpty(Map<SAMReaderID,SAMFileSpan> selectedSpans) {
|
||||
for(SAMFileSpan fileSpan: selectedSpans.values()) {
|
||||
if(!fileSpan.isEmpty())
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -24,20 +24,49 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Divide up large file pointers containing reads into more manageable subcomponents.
|
||||
* Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards.
|
||||
*
|
||||
* TODO: delete this class once the experimental downsampling engine fork collapses
|
||||
* When processing FilePointers, our strategy is to aggregate all FilePointers for each contig
|
||||
* together into one monolithic FilePointer, create one persistent set of read iterators over
|
||||
* that monolithic FilePointer, and repeatedly use that persistent set of read iterators to
|
||||
* fill read shards with reads.
|
||||
*
|
||||
* This strategy has several important advantages:
|
||||
*
|
||||
* 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole
|
||||
* contig will have regions that overlap with other FilePointers on the same contig, due
|
||||
* to the limited granularity of BAM index data. By creating only one FilePointer per contig,
|
||||
* we avoid having to track how much of each file region we've visited (as we did in the
|
||||
* former implementation), we avoid expensive non-sequential access patterns in the files,
|
||||
* and we avoid having to repeatedly re-create our iterator chain for every small region
|
||||
* of interest.
|
||||
*
|
||||
* 2. We avoid boundary issues with the engine-level downsampling. Since we create a single
|
||||
* persistent set of read iterators (which include the downsampling iterator(s)) per contig,
|
||||
* the downsampling process is never interrupted by FilePointer or Shard boundaries, and never
|
||||
* loses crucial state information while downsampling within a contig.
|
||||
*
|
||||
* TODO: There is also at least one important disadvantage:
|
||||
*
|
||||
* 1. We load more BAM index data into memory at once, and this work is done upfront before processing
|
||||
* the next contig, creating a delay before traversal of each contig. This delay may be
|
||||
* compensated for by the gains listed in #1 above, and we may be no worse off overall in
|
||||
* terms of total runtime, but we need to verify this empirically.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ReadShardBalancer extends ShardBalancer {
|
||||
|
||||
private static Logger logger = Logger.getLogger(ReadShardBalancer.class);
|
||||
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
|
|
@ -52,16 +81,27 @@ public class ReadShardBalancer extends ShardBalancer {
|
|||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentFilePointer;
|
||||
private FilePointer currentContigFilePointer = null;
|
||||
|
||||
/**
|
||||
* Ending position of the last shard in the file.
|
||||
* Iterator over the reads from the current contig's file pointer. The same iterator will be
|
||||
* used to fill all shards associated with a given file pointer
|
||||
*/
|
||||
private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
if(filePointers.hasNext())
|
||||
currentFilePointer = filePointers.next();
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
}
|
||||
|
||||
|
|
@ -70,58 +110,117 @@ public class ReadShardBalancer extends ShardBalancer {
|
|||
}
|
||||
|
||||
public Shard next() {
|
||||
if(!hasNext())
|
||||
if ( ! hasNext() )
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition;
|
||||
nextShard = null;
|
||||
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
// May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away
|
||||
while ( nextShard == null && currentContigFilePointer != null ) {
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
|
||||
selectedReaders.put(id,fileSpan);
|
||||
// If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one):
|
||||
if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) {
|
||||
|
||||
// Close the old, exhausted chain of iterators to release resources
|
||||
currentContigReadsIterator.close();
|
||||
|
||||
// Advance to the FilePointer for the next contig
|
||||
createNextContigFilePointer();
|
||||
|
||||
// We'll need to create a fresh iterator for this file pointer when we create the first
|
||||
// shard for it below.
|
||||
currentContigReadsIterator = null;
|
||||
}
|
||||
|
||||
if(!isEmpty(selectedReaders)) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
readsDataSource.fillShard(shard);
|
||||
// At this point our currentContigReadsIterator may be null or non-null depending on whether or not
|
||||
// this is our first shard for this file pointer.
|
||||
if ( currentContigFilePointer != null ) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
// Create a new reads iterator only when we've just advanced to the file pointer for the next
|
||||
// contig. It's essential that the iterators persist across all shards that share the same contig
|
||||
// to allow the downsampling to work properly.
|
||||
if ( currentContigReadsIterator == null ) {
|
||||
currentContigReadsIterator = new PeekableIterator<SAMRecord>(readsDataSource.getIterator(shard));
|
||||
}
|
||||
|
||||
if ( currentContigReadsIterator.hasNext() ) {
|
||||
shard.fill(currentContigReadsIterator);
|
||||
nextShard = shard;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
selectedReaders.clear();
|
||||
currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
|
||||
}
|
||||
|
||||
position = readsDataSource.getCurrentPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects whether the list of file spans contain any read data.
|
||||
* @param selectedSpans Mapping of readers to file spans.
|
||||
* @return True if file spans are completely empty; false otherwise.
|
||||
* Aggregate all FilePointers for the next contig together into one monolithic FilePointer
|
||||
* to avoid boundary issues with visiting the same file regions more than once (since more
|
||||
* granular FilePointers will have regions that overlap with other nearby FilePointers due
|
||||
* to the nature of BAM indices).
|
||||
*
|
||||
* By creating one persistent set of iterators per contig we also avoid boundary artifacts
|
||||
* in the engine-level downsampling.
|
||||
*
|
||||
* TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for
|
||||
* TODO: read traversals, as there's little point in the BAMSchedule emitting extremely
|
||||
* TODO: granular FilePointers if we're just going to union them. The BAMSchedule should
|
||||
* TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for
|
||||
* TODO: locus traversals).
|
||||
*/
|
||||
private boolean isEmpty(Map<SAMReaderID,SAMFileSpan> selectedSpans) {
|
||||
for(SAMFileSpan fileSpan: selectedSpans.values()) {
|
||||
if(!fileSpan.isEmpty())
|
||||
return false;
|
||||
private void createNextContigFilePointer() {
|
||||
currentContigFilePointer = null;
|
||||
List<FilePointer> nextContigFilePointers = new ArrayList<FilePointer>();
|
||||
|
||||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
(! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped &&
|
||||
nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) ||
|
||||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
// save it for next time
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
if ( ! nextContigFilePointers.isEmpty() ) {
|
||||
currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser);
|
||||
}
|
||||
|
||||
if ( currentContigFilePointer != null ) {
|
||||
logger.info("Done loading BAM index data for next contig");
|
||||
logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer));
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,12 +30,10 @@ import net.sf.samtools.*;
|
|||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.downsampling.*;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.downsampling.*;
|
||||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
|
|
@ -468,7 +466,7 @@ public class SAMDataSource {
|
|||
/**
|
||||
* Legacy method to fill the given buffering shard with reads.
|
||||
*
|
||||
* Shard.fill() is used instead of this method when experimental downsampling is enabled
|
||||
* Shard.fill() is used instead of this method unless legacy downsampling is enabled
|
||||
*
|
||||
* TODO: delete this method once the experimental downsampling engine fork collapses
|
||||
*
|
||||
|
|
@ -567,7 +565,7 @@ public class SAMDataSource {
|
|||
*
|
||||
* @return the start positions of the first chunk of reads for all BAM files
|
||||
*/
|
||||
public Map<SAMReaderID, GATKBAMFileSpan> getInitialReaderPositions() {
|
||||
protected Map<SAMReaderID, GATKBAMFileSpan> getInitialReaderPositions() {
|
||||
Map<SAMReaderID, GATKBAMFileSpan> initialPositions = new HashMap<SAMReaderID, GATKBAMFileSpan>();
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
|
||||
|
|
@ -585,7 +583,7 @@ public class SAMDataSource {
|
|||
* @param shard The shard specifying the data limits.
|
||||
* @return An iterator over the selected data.
|
||||
*/
|
||||
public StingSAMIterator getIterator( Shard shard ) {
|
||||
protected StingSAMIterator getIterator( Shard shard ) {
|
||||
return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard);
|
||||
}
|
||||
|
||||
|
|
@ -640,7 +638,8 @@ public class SAMDataSource {
|
|||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||
readProperties.getSupplementalFilters(),
|
||||
readProperties.getReadTransformers(),
|
||||
readProperties.defaultBaseQualities());
|
||||
readProperties.defaultBaseQualities(),
|
||||
shard instanceof LocusShard);
|
||||
}
|
||||
|
||||
private class BAMCodecIterator implements CloseableIterator<SAMRecord> {
|
||||
|
|
@ -697,6 +696,7 @@ public class SAMDataSource {
|
|||
* @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this.
|
||||
* @param supplementalFilters additional filters to apply to the reads.
|
||||
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
|
||||
* @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard
|
||||
* @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
|
||||
*/
|
||||
protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics,
|
||||
|
|
@ -707,7 +707,8 @@ public class SAMDataSource {
|
|||
Boolean noValidationOfReadOrder,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
List<ReadTransformer> readTransformers,
|
||||
byte defaultBaseQualities) {
|
||||
byte defaultBaseQualities,
|
||||
boolean isLocusBasedTraversal ) {
|
||||
|
||||
// ************************************************************************************************ //
|
||||
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
|
||||
|
|
@ -716,12 +717,26 @@ public class SAMDataSource {
|
|||
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
|
||||
if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
|
||||
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
|
||||
// If we're using the new downsampling implementation, apply downsampling iterators at this
|
||||
// point in the read stream for most (but not all) cases
|
||||
if ( ! readProperties.getDownsamplingMethod().useLegacyDownsampler ) {
|
||||
|
||||
// For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers
|
||||
// will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding
|
||||
// splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling
|
||||
// of individual reads.
|
||||
boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal &&
|
||||
readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
|
||||
readProperties.getDownsamplingMethod().toCoverage != null;
|
||||
|
||||
if ( ! assumeDownstreamLIBSDownsampling ) {
|
||||
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
|
||||
}
|
||||
}
|
||||
|
||||
// Use the old fractional downsampler only if we're not using experimental downsampling:
|
||||
if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
|
||||
// Use the old fractional downsampler only if we're using legacy downsampling:
|
||||
// TODO: remove this statement (and associated classes) once the downsampling engine fork collapses
|
||||
if ( readProperties.getDownsamplingMethod().useLegacyDownsampler && downsamplingFraction != null )
|
||||
wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
|
||||
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
|
||||
|
|
@ -743,19 +758,37 @@ public class SAMDataSource {
|
|||
}
|
||||
|
||||
protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
|
||||
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
|
||||
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ?
|
||||
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
|
||||
new FractionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
|
||||
|
||||
return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory);
|
||||
if ( readProperties.getDownsamplingMethod() == null ||
|
||||
readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) {
|
||||
return wrappedIterator;
|
||||
}
|
||||
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
|
||||
ReadsDownsampler<SAMRecord> downsampler = readProperties.getDownsamplingMethod().toCoverage != null ?
|
||||
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
|
||||
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
|
||||
|
||||
return new DownsamplingReadsIterator(wrappedIterator, downsampler);
|
||||
if ( readProperties.getDownsamplingMethod().toFraction != null ) {
|
||||
|
||||
// If we're downsampling to a fraction of reads, there's no point in paying the cost of
|
||||
// splitting/re-assembling the read stream by sample to run the FractionalDownsampler on
|
||||
// reads from each sample separately, since the result would be the same as running the
|
||||
// FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator
|
||||
// rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling
|
||||
// was requested.
|
||||
|
||||
return new DownsamplingReadsIterator(wrappedIterator,
|
||||
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction));
|
||||
}
|
||||
else if ( readProperties.getDownsamplingMethod().toCoverage != null ) {
|
||||
|
||||
// If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling
|
||||
// the read stream to run the downsampler on the reads for each individual sample separately if
|
||||
// BY_SAMPLE downsampling was requested.
|
||||
|
||||
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
|
||||
return new PerSampleDownsamplingReadsIterator(wrappedIterator,
|
||||
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage));
|
||||
}
|
||||
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
|
||||
return new DownsamplingReadsIterator(wrappedIterator,
|
||||
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage));
|
||||
}
|
||||
}
|
||||
|
||||
return wrappedIterator;
|
||||
|
|
|
|||
|
|
@ -50,9 +50,9 @@ public class DownsamplingMethod {
|
|||
public final Double toFraction;
|
||||
|
||||
/**
|
||||
* Use the new experimental downsampling?
|
||||
* Use the legacy downsampling implementation instead of the newer implementation?
|
||||
*/
|
||||
public final boolean useExperimentalDownsampling;
|
||||
public final boolean useLegacyDownsampler;
|
||||
|
||||
/**
|
||||
* Expresses no downsampling applied at all.
|
||||
|
|
@ -69,11 +69,11 @@ public class DownsamplingMethod {
|
|||
*/
|
||||
public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;
|
||||
|
||||
public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
|
||||
public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useLegacyDownsampler ) {
|
||||
this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;
|
||||
this.toCoverage = toCoverage;
|
||||
this.toFraction = toFraction;
|
||||
this.useExperimentalDownsampling = useExperimentalDownsampling;
|
||||
this.useLegacyDownsampler = useLegacyDownsampler;
|
||||
|
||||
if ( type == DownsampleType.NONE ) {
|
||||
toCoverage = null;
|
||||
|
|
@ -101,19 +101,19 @@ public class DownsamplingMethod {
|
|||
if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
|
||||
throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
|
||||
}
|
||||
}
|
||||
|
||||
// Some restrictions only exist for the old downsampling implementation:
|
||||
if ( ! useExperimentalDownsampling ) {
|
||||
// By sample downsampling does not work with a fraction of reads in the old downsampling implementation
|
||||
if( type == DownsampleType.BY_SAMPLE && toFraction != null )
|
||||
throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
|
||||
public void checkCompatibilityWithWalker( Walker walker ) {
|
||||
boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker;
|
||||
|
||||
if ( ! isLocusTraversal && useLegacyDownsampler && toCoverage != null ) {
|
||||
throw new UserException.CommandLineException("Downsampling to coverage for read-based traversals (eg., ReadWalkers) is not supported in the legacy downsampling implementation. " +
|
||||
"The newer downsampling implementation does not have this limitation.");
|
||||
}
|
||||
|
||||
// Some restrictions only exist for the new downsampling implementation:
|
||||
if ( useExperimentalDownsampling ) {
|
||||
if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
|
||||
throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
|
||||
}
|
||||
if ( isLocusTraversal && ! useLegacyDownsampler && type == DownsampleType.ALL_READS && toCoverage != null ) {
|
||||
throw new UserException.CommandLineException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not yet supported in the new downsampling implementation (though it is supported for ReadWalkers). " +
|
||||
"You can run with --use_legacy_downsampler for a broken and poorly-maintained implementation of ALL_READS to-coverage downsampling, but this is not recommended.");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -124,30 +124,34 @@ public class DownsamplingMethod {
|
|||
builder.append("No downsampling");
|
||||
}
|
||||
else {
|
||||
builder.append(String.format("Method: %s ", type));
|
||||
builder.append(String.format("Method: %s, ", type));
|
||||
|
||||
if ( toCoverage != null ) {
|
||||
builder.append(String.format("Target Coverage: %d ", toCoverage));
|
||||
builder.append(String.format("Target Coverage: %d, ", toCoverage));
|
||||
}
|
||||
else {
|
||||
builder.append(String.format("Target Fraction: %.2f ", toFraction));
|
||||
builder.append(String.format("Target Fraction: %.2f, ", toFraction));
|
||||
}
|
||||
|
||||
if ( useExperimentalDownsampling ) {
|
||||
builder.append("Using Experimental Downsampling");
|
||||
if ( useLegacyDownsampler ) {
|
||||
builder.append("Using the legacy downsampling implementation");
|
||||
}
|
||||
else {
|
||||
builder.append("Using the new downsampling implementation");
|
||||
}
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useLegacyDownsampler ) {
|
||||
if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
|
||||
null, useExperimentalDownsampling);
|
||||
null, useLegacyDownsampler);
|
||||
}
|
||||
else {
|
||||
return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
|
||||
// Downsampling is off by default for non-locus-based traversals
|
||||
return new DownsamplingMethod(DownsampleType.NONE, null, null, useLegacyDownsampler);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no
|
||||
* downsampling whatsoever, and instead simply "passes-through" all the reads it's given.
|
||||
* Useful for situations where you want to disable downsampling, but still need to use
|
||||
* the downsampler interface.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class PassThroughDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private ArrayList<T> selectedReads;
|
||||
|
||||
public PassThroughDownsampler() {
|
||||
clear();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
// All reads pass-through, no reads get downsampled
|
||||
selectedReads.add(newRead);
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasFinalizedItems() {
|
||||
return selectedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = selectedReads;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return selectedReads.isEmpty() ? null : selectedReads.get(0);
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
selectedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
}
|
||||
|
|
@ -43,7 +43,6 @@ import org.broadinstitute.sting.utils.AutoFormattingTime;
|
|||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
|
||||
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
|
||||
|
||||
|
|
@ -346,9 +345,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
for ( final TraversalEngine te : allCreatedTraversalEngines)
|
||||
te.shutdown();
|
||||
|
||||
// horrible hack to print nano scheduling information across all nano schedulers, if any were used
|
||||
NanoScheduler.printCombinedRuntimeProfile();
|
||||
|
||||
allCreatedTraversalEngines.clear();
|
||||
availableTraversalEngines.clear();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@ import net.sf.picard.util.PeekableIterator;
|
|||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -83,17 +83,18 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
|
|||
this.sourceInfo = shard.getReadProperties();
|
||||
this.readIterator = iterator;
|
||||
|
||||
// Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested:
|
||||
this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ?
|
||||
new PeekableIterator<AlignmentContext>(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames))
|
||||
// Use the legacy version of LocusIteratorByState if legacy downsampling was requested:
|
||||
this.sourceIterator = sourceInfo.getDownsamplingMethod().useLegacyDownsampler ?
|
||||
new PeekableIterator<AlignmentContext>(new LegacyLocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames))
|
||||
:
|
||||
new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
|
||||
new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames));
|
||||
|
||||
|
||||
this.intervalIterator = intervals.size()>0 ? new PeekableIterator<GenomeLoc>(intervals.iterator()) : null;
|
||||
}
|
||||
|
||||
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals ) {
|
||||
this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups());
|
||||
this(shard, genomeLocParser, iterator, intervals, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups());
|
||||
}
|
||||
|
||||
public Iterator<WindowMakerIterator> iterator() {
|
||||
|
|
|
|||
|
|
@ -31,14 +31,14 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
|
||||
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.LegacyReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -50,11 +50,11 @@ import java.util.*;
|
|||
/**
|
||||
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
|
||||
*/
|
||||
public class LocusIteratorByStateExperimental extends LocusIterator {
|
||||
public class LegacyLocusIteratorByState extends LocusIterator {
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
|
||||
private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class);
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -69,7 +69,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
private final ArrayList<String> samples;
|
||||
private final ReadStateManager readStates;
|
||||
|
||||
protected static class SAMRecordState {
|
||||
static private class SAMRecordState {
|
||||
SAMRecord read;
|
||||
int readOffset = -1; // how far are we offset from the start of the read bases?
|
||||
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
|
||||
|
|
@ -213,7 +213,6 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
//final boolean DEBUG2 = false && DEBUG;
|
||||
private ReadProperties readInfo;
|
||||
private AlignmentContext nextAlignmentContext;
|
||||
private boolean performLevelingDownsampling;
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -221,15 +220,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public LocusIteratorByStateExperimental(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
|
||||
public LegacyLocusIteratorByState(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
|
||||
this.readInfo = readInformation;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.samples = new ArrayList<String>(samples);
|
||||
this.readStates = new ReadStateManager(samIterator);
|
||||
|
||||
this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null &&
|
||||
readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
|
||||
readInfo.getDownsamplingMethod().toCoverage != null;
|
||||
this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod());
|
||||
|
||||
// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
|
||||
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
|
||||
|
|
@ -290,13 +285,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
|
||||
final GenomeLoc location = getLocation();
|
||||
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
|
||||
// TODO: How can you determine here whether the current pileup has been downsampled?
|
||||
boolean hasBeenSampled = false;
|
||||
|
||||
for (final String sample : samples) {
|
||||
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
|
||||
int size = 0; // number of elements in this sample's pileup
|
||||
int nDeletions = 0; // number of deletions in this sample's pileup
|
||||
|
|
@ -405,20 +398,34 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
protected class ReadStateManager {
|
||||
private class ReadStateManager {
|
||||
private final PeekableIterator<SAMRecord> iterator;
|
||||
private final DownsamplingMethod downsamplingMethod;
|
||||
private final SamplePartitioner samplePartitioner;
|
||||
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
|
||||
private final int targetCoverage;
|
||||
private int totalReadStates = 0;
|
||||
|
||||
public ReadStateManager(Iterator<SAMRecord> source) {
|
||||
public ReadStateManager(Iterator<SAMRecord> source, DownsamplingMethod downsamplingMethod) {
|
||||
this.iterator = new PeekableIterator<SAMRecord>(source);
|
||||
|
||||
for (final String sample : samples) {
|
||||
readStatesBySample.put(sample, new PerSampleReadStateManager());
|
||||
this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE;
|
||||
switch (this.downsamplingMethod.type) {
|
||||
case BY_SAMPLE:
|
||||
if (downsamplingMethod.toCoverage == null)
|
||||
throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample");
|
||||
this.targetCoverage = downsamplingMethod.toCoverage;
|
||||
break;
|
||||
default:
|
||||
this.targetCoverage = Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
samplePartitioner = new SamplePartitioner();
|
||||
Map<String, ReadSelector> readSelectors = new HashMap<String, ReadSelector>();
|
||||
for (final String sample : samples) {
|
||||
readStatesBySample.put(sample, new PerSampleReadStateManager());
|
||||
readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector());
|
||||
}
|
||||
|
||||
samplePartitioner = new SamplePartitioner(readSelectors);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -442,6 +449,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
|
||||
public void remove() {
|
||||
wrappedIterator.remove();
|
||||
totalReadStates--;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
@ -469,6 +477,17 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
return readStatesBySample.get(sample).size();
|
||||
}
|
||||
|
||||
/**
|
||||
* The extent of downsampling; basically, the furthest base out which has 'fallen
|
||||
* victim' to the downsampler.
|
||||
*
|
||||
* @param sample Sample, downsampled independently.
|
||||
* @return Integer stop of the furthest undownsampled region.
|
||||
*/
|
||||
public int getDownsamplingExtent(final String sample) {
|
||||
return readStatesBySample.get(sample).getDownsamplingExtent();
|
||||
}
|
||||
|
||||
public SAMRecordState getFirst() {
|
||||
for (final String sample : samples) {
|
||||
PerSampleReadStateManager reads = readStatesBySample.get(sample);
|
||||
|
|
@ -501,13 +520,61 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
samplePartitioner.submitRead(iterator.next());
|
||||
}
|
||||
}
|
||||
samplePartitioner.complete();
|
||||
|
||||
for (final String sample : samples) {
|
||||
Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
|
||||
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
|
||||
addReadsToSample(statesBySample, newReads);
|
||||
}
|
||||
ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
|
||||
|
||||
Collection<SAMRecord> newReads = new ArrayList<SAMRecord>(aggregator.getSelectedReads());
|
||||
|
||||
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
|
||||
int numReads = statesBySample.size();
|
||||
int downsamplingExtent = aggregator.getDownsamplingExtent();
|
||||
|
||||
if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) {
|
||||
long readLimit = aggregator.getNumReadsSeen();
|
||||
addReadsToSample(statesBySample, newReads, readLimit);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
} else {
|
||||
int[] counts = statesBySample.getCountsPerAlignmentStart();
|
||||
int[] updatedCounts = new int[counts.length];
|
||||
System.arraycopy(counts, 0, updatedCounts, 0, counts.length);
|
||||
|
||||
boolean readPruned = true;
|
||||
while (numReads + newReads.size() > targetCoverage && readPruned) {
|
||||
readPruned = false;
|
||||
for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) {
|
||||
if (updatedCounts[alignmentStart] > 1) {
|
||||
updatedCounts[alignmentStart]--;
|
||||
numReads--;
|
||||
readPruned = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numReads == targetCoverage) {
|
||||
updatedCounts[0]--;
|
||||
numReads--;
|
||||
}
|
||||
|
||||
BitSet toPurge = new BitSet(readStates.size());
|
||||
int readOffset = 0;
|
||||
|
||||
for (int i = 0; i < updatedCounts.length; i++) {
|
||||
int n = counts[i];
|
||||
int k = updatedCounts[i];
|
||||
|
||||
for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k))
|
||||
toPurge.set(readOffset + purgedElement);
|
||||
|
||||
readOffset += counts[i];
|
||||
}
|
||||
downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge));
|
||||
|
||||
addReadsToSample(statesBySample, newReads, targetCoverage - numReads);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
}
|
||||
}
|
||||
samplePartitioner.reset();
|
||||
}
|
||||
|
||||
|
|
@ -516,134 +583,380 @@ public class LocusIteratorByStateExperimental extends LocusIterator {
|
|||
*
|
||||
* @param readStates The list of read states to add this collection of reads.
|
||||
* @param reads Reads to add. Selected reads will be pulled from this source.
|
||||
* @param maxReads Maximum number of reads to add.
|
||||
*/
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads, final long maxReads) {
|
||||
if (reads.isEmpty())
|
||||
return;
|
||||
|
||||
Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();
|
||||
|
||||
int readCount = 0;
|
||||
for (SAMRecord read : reads) {
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
if (readCount < maxReads) {
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
readCount++;
|
||||
}
|
||||
}
|
||||
|
||||
readStates.addStatesAtNextAlignmentStart(newReadStates);
|
||||
}
|
||||
|
||||
protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
|
||||
private int thisSampleReadStates = 0;
|
||||
private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
|
||||
performLevelingDownsampling ?
|
||||
new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
|
||||
null;
|
||||
private class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private final Queue<SAMRecordState> readStates = new LinkedList<SAMRecordState>();
|
||||
private final Deque<Counter> readStateCounter = new LinkedList<Counter>();
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
|
||||
if ( states.isEmpty() ) {
|
||||
return;
|
||||
}
|
||||
|
||||
readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
|
||||
thisSampleReadStates += states.size();
|
||||
readStates.addAll(states);
|
||||
readStateCounter.add(new Counter(states.size()));
|
||||
totalReadStates += states.size();
|
||||
|
||||
if ( levelingDownsampler != null ) {
|
||||
levelingDownsampler.submit(readStatesByAlignmentStart);
|
||||
levelingDownsampler.signalEndOfInput();
|
||||
|
||||
thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
|
||||
// use returned List directly rather than make a copy, for efficiency's sake
|
||||
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
|
||||
levelingDownsampler.reset();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return readStatesByAlignmentStart.isEmpty();
|
||||
return readStates.isEmpty();
|
||||
}
|
||||
|
||||
public SAMRecordState peek() {
|
||||
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
|
||||
return readStates.peek();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return thisSampleReadStates;
|
||||
return readStates.size();
|
||||
}
|
||||
|
||||
public void specifyNewDownsamplingExtent(int downsamplingExtent) {
|
||||
this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent);
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public int[] getCountsPerAlignmentStart() {
|
||||
int[] counts = new int[readStateCounter.size()];
|
||||
int index = 0;
|
||||
for (Counter counter : readStateCounter)
|
||||
counts[index++] = counter.getCount();
|
||||
return counts;
|
||||
}
|
||||
|
||||
public Iterator<SAMRecordState> iterator() {
|
||||
return new Iterator<SAMRecordState>() {
|
||||
private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
|
||||
private LinkedList<SAMRecordState> currentPositionReadStates = null;
|
||||
private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;
|
||||
private Iterator<SAMRecordState> wrappedIterator = readStates.iterator();
|
||||
|
||||
public boolean hasNext() {
|
||||
return alignmentStartIterator.hasNext() ||
|
||||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
|
||||
return wrappedIterator.hasNext();
|
||||
}
|
||||
|
||||
public SAMRecordState next() {
|
||||
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
|
||||
currentPositionReadStates = alignmentStartIterator.next();
|
||||
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
|
||||
}
|
||||
|
||||
return currentPositionReadStatesIterator.next();
|
||||
return wrappedIterator.next();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
currentPositionReadStatesIterator.remove();
|
||||
thisSampleReadStates--;
|
||||
totalReadStates--;
|
||||
|
||||
if ( currentPositionReadStates.isEmpty() ) {
|
||||
alignmentStartIterator.remove();
|
||||
}
|
||||
wrappedIterator.remove();
|
||||
Counter counter = readStateCounter.peek();
|
||||
counter.decrement();
|
||||
if (counter.getCount() == 0)
|
||||
readStateCounter.remove();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Purge the given elements from the bitset. If an element in the bitset is true, purge
|
||||
* the corresponding read state.
|
||||
*
|
||||
* @param elements bits from the set to purge.
|
||||
* @return the extent of the final downsampled read.
|
||||
*/
|
||||
public int purge(final BitSet elements) {
|
||||
int downsamplingExtent = 0;
|
||||
|
||||
if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
|
||||
|
||||
Iterator<SAMRecordState> readStateIterator = readStates.iterator();
|
||||
|
||||
Iterator<Counter> counterIterator = readStateCounter.iterator();
|
||||
Counter currentCounter = counterIterator.next();
|
||||
|
||||
int readIndex = 0;
|
||||
long alignmentStartCounter = currentCounter.getCount();
|
||||
|
||||
int toPurge = elements.nextSetBit(0);
|
||||
int removedCount = 0;
|
||||
|
||||
while (readStateIterator.hasNext() && toPurge >= 0) {
|
||||
SAMRecordState state = readStateIterator.next();
|
||||
downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd());
|
||||
|
||||
if (readIndex == toPurge) {
|
||||
readStateIterator.remove();
|
||||
currentCounter.decrement();
|
||||
if (currentCounter.getCount() == 0)
|
||||
counterIterator.remove();
|
||||
removedCount++;
|
||||
toPurge = elements.nextSetBit(toPurge + 1);
|
||||
}
|
||||
|
||||
readIndex++;
|
||||
alignmentStartCounter--;
|
||||
if (alignmentStartCounter == 0 && counterIterator.hasNext()) {
|
||||
currentCounter = counterIterator.next();
|
||||
alignmentStartCounter = currentCounter.getCount();
|
||||
}
|
||||
}
|
||||
|
||||
totalReadStates -= removedCount;
|
||||
|
||||
return downsamplingExtent;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
* Note: assuming that, whenever we downsample, we downsample to an integer capacity.
|
||||
*/
|
||||
private class SamplePartitioner {
|
||||
private Map<String, Collection<SAMRecord>> readsBySample;
|
||||
private long readsSeen = 0;
|
||||
static private class Counter {
|
||||
private int count;
|
||||
|
||||
public SamplePartitioner() {
|
||||
readsBySample = new HashMap<String, Collection<SAMRecord>>();
|
||||
|
||||
for ( String sample : samples ) {
|
||||
readsBySample.put(sample, new ArrayList<SAMRecord>());
|
||||
}
|
||||
public Counter(int count) {
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).add(read);
|
||||
readsSeen++;
|
||||
public int getCount() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getReadsForSample(String sampleName) {
|
||||
if ( ! readsBySample.containsKey(sampleName) )
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
return readsBySample.get(sampleName);
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
for ( Collection<SAMRecord> perSampleReads : readsBySample.values() )
|
||||
perSampleReads.clear();
|
||||
readsSeen = 0;
|
||||
public void decrement() {
|
||||
count--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Selects reads passed to it based on a criteria decided through inheritance.
|
||||
* TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this.
|
||||
*/
|
||||
interface ReadSelector {
|
||||
/**
|
||||
* All previous selectors in the chain have allowed this read. Submit it to this selector for consideration.
|
||||
*
|
||||
* @param read the read to evaluate.
|
||||
*/
|
||||
public void submitRead(SAMRecord read);
|
||||
|
||||
/**
|
||||
* A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid.
|
||||
*
|
||||
* @param read the read previously rejected.
|
||||
*/
|
||||
public void notifyReadRejected(SAMRecord read);
|
||||
|
||||
/**
|
||||
* Signal the selector that read additions are complete.
|
||||
*/
|
||||
public void complete();
|
||||
|
||||
/**
|
||||
* Retrieve the number of reads seen by this selector so far.
|
||||
*
|
||||
* @return number of reads seen.
|
||||
*/
|
||||
public long getNumReadsSeen();
|
||||
|
||||
/**
|
||||
* Return the number of reads accepted by this selector so far.
|
||||
*
|
||||
* @return number of reads selected.
|
||||
*/
|
||||
public long getNumReadsSelected();
|
||||
|
||||
/**
|
||||
* Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the
|
||||
* last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at
|
||||
* position 3 whose cigar string is 76M, the value of this parameter will be 78.
|
||||
*
|
||||
* @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0.
|
||||
*/
|
||||
public int getDownsamplingExtent();
|
||||
|
||||
/**
|
||||
* Get the reads selected by this selector.
|
||||
*
|
||||
* @return collection of reads selected by this selector.
|
||||
*/
|
||||
public Collection<SAMRecord> getSelectedReads();
|
||||
|
||||
/**
|
||||
* Reset this collection to its pre-gathered state.
|
||||
*/
|
||||
public void reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Select every read passed in.
|
||||
*/
|
||||
class AllReadsSelector implements ReadSelector {
|
||||
private Collection<SAMRecord> reads = new LinkedList<SAMRecord>();
|
||||
private long readsSeen = 0;
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
reads.add(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
// NO-OP.
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reads;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
reads.clear();
|
||||
readsSeen = 0;
|
||||
downsamplingExtent = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Select N reads randomly from the input stream.
|
||||
*/
|
||||
class NRandomReadSelector implements ReadSelector {
|
||||
private final LegacyReservoirDownsampler<SAMRecord> reservoir;
|
||||
private final ReadSelector chainedSelector;
|
||||
private long readsSeen = 0;
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) {
|
||||
this.reservoir = new LegacyReservoirDownsampler<SAMRecord>((int) readLimit);
|
||||
this.chainedSelector = chainedSelector;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
SAMRecord displaced = reservoir.add(read);
|
||||
if (displaced != null && chainedSelector != null) {
|
||||
chainedSelector.notifyReadRejected(read);
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
for (SAMRecord read : reservoir.getDownsampledContents())
|
||||
chainedSelector.submitRead(read);
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.complete();
|
||||
}
|
||||
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return reservoir.size();
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reservoir.getDownsampledContents();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
reservoir.clear();
|
||||
downsamplingExtent = 0;
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.reset();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
*/
|
||||
class SamplePartitioner implements ReadSelector {
|
||||
private final Map<String, ReadSelector> readsBySample;
|
||||
private long readsSeen = 0;
|
||||
|
||||
public SamplePartitioner(Map<String, ReadSelector> readSelectors) {
|
||||
readsBySample = readSelectors;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).submitRead(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).notifyReadRejected(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
// NO-OP.
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
int downsamplingExtent = 0;
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent());
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner.");
|
||||
}
|
||||
|
||||
public ReadSelector getSelectedReads(String sampleName) {
|
||||
if (!readsBySample.containsKey(sampleName))
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
return readsBySample.get(sampleName);
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
storage.reset();
|
||||
readsSeen = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -31,14 +31,11 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.ReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -54,7 +51,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
|
||||
private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class);
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
private final ArrayList<String> samples;
|
||||
private final ReadStateManager readStates;
|
||||
|
||||
static private class SAMRecordState {
|
||||
protected static class SAMRecordState {
|
||||
SAMRecord read;
|
||||
int readOffset = -1; // how far are we offset from the start of the read bases?
|
||||
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
|
||||
|
|
@ -213,6 +210,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
//final boolean DEBUG2 = false && DEBUG;
|
||||
private ReadProperties readInfo;
|
||||
private AlignmentContext nextAlignmentContext;
|
||||
private boolean performDownsampling;
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -224,7 +222,18 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
this.readInfo = readInformation;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.samples = new ArrayList<String>(samples);
|
||||
this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod());
|
||||
|
||||
// LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're
|
||||
// downsampling to coverage by sample. SAMDataSource will have refrained from applying
|
||||
// any downsamplers to the read stream in this case, in the expectation that LIBS will
|
||||
// manage the downsampling. The reason for this is twofold: performance (don't have to
|
||||
// split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling
|
||||
// of reads (eg., using half of a read, and throwing the rest away).
|
||||
this.performDownsampling = readInfo.getDownsamplingMethod() != null &&
|
||||
readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
|
||||
readInfo.getDownsamplingMethod().toCoverage != null;
|
||||
|
||||
this.readStates = new ReadStateManager(samIterator);
|
||||
|
||||
// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
|
||||
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
|
||||
|
|
@ -285,11 +294,13 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
|
||||
final GenomeLoc location = getLocation();
|
||||
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
|
||||
// TODO: How can you determine here whether the current pileup has been downsampled?
|
||||
boolean hasBeenSampled = false;
|
||||
|
||||
for (final String sample : samples) {
|
||||
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
|
||||
int size = 0; // number of elements in this sample's pileup
|
||||
int nDeletions = 0; // number of deletions in this sample's pileup
|
||||
|
|
@ -398,34 +409,20 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
private class ReadStateManager {
|
||||
protected class ReadStateManager {
|
||||
private final PeekableIterator<SAMRecord> iterator;
|
||||
private final DownsamplingMethod downsamplingMethod;
|
||||
private final SamplePartitioner samplePartitioner;
|
||||
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
|
||||
private final int targetCoverage;
|
||||
private int totalReadStates = 0;
|
||||
|
||||
public ReadStateManager(Iterator<SAMRecord> source, DownsamplingMethod downsamplingMethod) {
|
||||
public ReadStateManager(Iterator<SAMRecord> source) {
|
||||
this.iterator = new PeekableIterator<SAMRecord>(source);
|
||||
this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE;
|
||||
switch (this.downsamplingMethod.type) {
|
||||
case BY_SAMPLE:
|
||||
if (downsamplingMethod.toCoverage == null)
|
||||
throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample");
|
||||
this.targetCoverage = downsamplingMethod.toCoverage;
|
||||
break;
|
||||
default:
|
||||
this.targetCoverage = Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
Map<String, ReadSelector> readSelectors = new HashMap<String, ReadSelector>();
|
||||
for (final String sample : samples) {
|
||||
readStatesBySample.put(sample, new PerSampleReadStateManager());
|
||||
readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector());
|
||||
}
|
||||
|
||||
samplePartitioner = new SamplePartitioner(readSelectors);
|
||||
samplePartitioner = new SamplePartitioner(performDownsampling);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -449,7 +446,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
|
||||
public void remove() {
|
||||
wrappedIterator.remove();
|
||||
totalReadStates--;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
@ -477,17 +473,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
return readStatesBySample.get(sample).size();
|
||||
}
|
||||
|
||||
/**
|
||||
* The extent of downsampling; basically, the furthest base out which has 'fallen
|
||||
* victim' to the downsampler.
|
||||
*
|
||||
* @param sample Sample, downsampled independently.
|
||||
* @return Integer stop of the furthest undownsampled region.
|
||||
*/
|
||||
public int getDownsamplingExtent(final String sample) {
|
||||
return readStatesBySample.get(sample).getDownsamplingExtent();
|
||||
}
|
||||
|
||||
public SAMRecordState getFirst() {
|
||||
for (final String sample : samples) {
|
||||
PerSampleReadStateManager reads = readStatesBySample.get(sample);
|
||||
|
|
@ -520,61 +505,15 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
samplePartitioner.submitRead(iterator.next());
|
||||
}
|
||||
}
|
||||
samplePartitioner.complete();
|
||||
|
||||
samplePartitioner.doneSubmittingReads();
|
||||
|
||||
for (final String sample : samples) {
|
||||
ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
|
||||
|
||||
Collection<SAMRecord> newReads = new ArrayList<SAMRecord>(aggregator.getSelectedReads());
|
||||
|
||||
Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
|
||||
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
|
||||
int numReads = statesBySample.size();
|
||||
int downsamplingExtent = aggregator.getDownsamplingExtent();
|
||||
|
||||
if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) {
|
||||
long readLimit = aggregator.getNumReadsSeen();
|
||||
addReadsToSample(statesBySample, newReads, readLimit);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
} else {
|
||||
int[] counts = statesBySample.getCountsPerAlignmentStart();
|
||||
int[] updatedCounts = new int[counts.length];
|
||||
System.arraycopy(counts, 0, updatedCounts, 0, counts.length);
|
||||
|
||||
boolean readPruned = true;
|
||||
while (numReads + newReads.size() > targetCoverage && readPruned) {
|
||||
readPruned = false;
|
||||
for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) {
|
||||
if (updatedCounts[alignmentStart] > 1) {
|
||||
updatedCounts[alignmentStart]--;
|
||||
numReads--;
|
||||
readPruned = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numReads == targetCoverage) {
|
||||
updatedCounts[0]--;
|
||||
numReads--;
|
||||
}
|
||||
|
||||
BitSet toPurge = new BitSet(readStates.size());
|
||||
int readOffset = 0;
|
||||
|
||||
for (int i = 0; i < updatedCounts.length; i++) {
|
||||
int n = counts[i];
|
||||
int k = updatedCounts[i];
|
||||
|
||||
for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k))
|
||||
toPurge.set(readOffset + purgedElement);
|
||||
|
||||
readOffset += counts[i];
|
||||
}
|
||||
downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge));
|
||||
|
||||
addReadsToSample(statesBySample, newReads, targetCoverage - numReads);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
}
|
||||
addReadsToSample(statesBySample, newReads);
|
||||
}
|
||||
|
||||
samplePartitioner.reset();
|
||||
}
|
||||
|
||||
|
|
@ -583,380 +522,140 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
*
|
||||
* @param readStates The list of read states to add this collection of reads.
|
||||
* @param reads Reads to add. Selected reads will be pulled from this source.
|
||||
* @param maxReads Maximum number of reads to add.
|
||||
*/
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads, final long maxReads) {
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
|
||||
if (reads.isEmpty())
|
||||
return;
|
||||
|
||||
Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();
|
||||
int readCount = 0;
|
||||
|
||||
for (SAMRecord read : reads) {
|
||||
if (readCount < maxReads) {
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
readCount++;
|
||||
}
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
}
|
||||
|
||||
readStates.addStatesAtNextAlignmentStart(newReadStates);
|
||||
}
|
||||
|
||||
private class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private final Queue<SAMRecordState> readStates = new LinkedList<SAMRecordState>();
|
||||
private final Deque<Counter> readStateCounter = new LinkedList<Counter>();
|
||||
private int downsamplingExtent = 0;
|
||||
protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
|
||||
private int thisSampleReadStates = 0;
|
||||
private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
|
||||
performDownsampling ?
|
||||
new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
|
||||
null;
|
||||
|
||||
public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
|
||||
readStates.addAll(states);
|
||||
readStateCounter.add(new Counter(states.size()));
|
||||
if ( states.isEmpty() ) {
|
||||
return;
|
||||
}
|
||||
|
||||
readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
|
||||
thisSampleReadStates += states.size();
|
||||
totalReadStates += states.size();
|
||||
|
||||
if ( levelingDownsampler != null ) {
|
||||
levelingDownsampler.submit(readStatesByAlignmentStart);
|
||||
levelingDownsampler.signalEndOfInput();
|
||||
|
||||
thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
|
||||
// use returned List directly rather than make a copy, for efficiency's sake
|
||||
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
|
||||
levelingDownsampler.reset();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return readStates.isEmpty();
|
||||
return readStatesByAlignmentStart.isEmpty();
|
||||
}
|
||||
|
||||
public SAMRecordState peek() {
|
||||
return readStates.peek();
|
||||
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return readStates.size();
|
||||
}
|
||||
|
||||
public void specifyNewDownsamplingExtent(int downsamplingExtent) {
|
||||
this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent);
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public int[] getCountsPerAlignmentStart() {
|
||||
int[] counts = new int[readStateCounter.size()];
|
||||
int index = 0;
|
||||
for (Counter counter : readStateCounter)
|
||||
counts[index++] = counter.getCount();
|
||||
return counts;
|
||||
return thisSampleReadStates;
|
||||
}
|
||||
|
||||
public Iterator<SAMRecordState> iterator() {
|
||||
return new Iterator<SAMRecordState>() {
|
||||
private Iterator<SAMRecordState> wrappedIterator = readStates.iterator();
|
||||
private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
|
||||
private LinkedList<SAMRecordState> currentPositionReadStates = null;
|
||||
private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;
|
||||
|
||||
public boolean hasNext() {
|
||||
return wrappedIterator.hasNext();
|
||||
return alignmentStartIterator.hasNext() ||
|
||||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
|
||||
}
|
||||
|
||||
public SAMRecordState next() {
|
||||
return wrappedIterator.next();
|
||||
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
|
||||
currentPositionReadStates = alignmentStartIterator.next();
|
||||
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
|
||||
}
|
||||
|
||||
return currentPositionReadStatesIterator.next();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
wrappedIterator.remove();
|
||||
Counter counter = readStateCounter.peek();
|
||||
counter.decrement();
|
||||
if (counter.getCount() == 0)
|
||||
readStateCounter.remove();
|
||||
currentPositionReadStatesIterator.remove();
|
||||
thisSampleReadStates--;
|
||||
totalReadStates--;
|
||||
|
||||
if ( currentPositionReadStates.isEmpty() ) {
|
||||
alignmentStartIterator.remove();
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Purge the given elements from the bitset. If an element in the bitset is true, purge
|
||||
* the corresponding read state.
|
||||
*
|
||||
* @param elements bits from the set to purge.
|
||||
* @return the extent of the final downsampled read.
|
||||
*/
|
||||
public int purge(final BitSet elements) {
|
||||
int downsamplingExtent = 0;
|
||||
/**
|
||||
* Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler.
|
||||
*
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
*/
|
||||
private class SamplePartitioner {
|
||||
private Map<String, Downsampler<SAMRecord>> readsBySample;
|
||||
|
||||
if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
|
||||
public SamplePartitioner( boolean downsampleReads ) {
|
||||
readsBySample = new HashMap<String, Downsampler<SAMRecord>>();
|
||||
|
||||
Iterator<SAMRecordState> readStateIterator = readStates.iterator();
|
||||
for ( String sample : samples ) {
|
||||
readsBySample.put(sample,
|
||||
downsampleReads ? new ReservoirDownsampler<SAMRecord>(readInfo.getDownsamplingMethod().toCoverage) :
|
||||
new PassThroughDownsampler<SAMRecord>());
|
||||
}
|
||||
}
|
||||
|
||||
Iterator<Counter> counterIterator = readStateCounter.iterator();
|
||||
Counter currentCounter = counterIterator.next();
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).submit(read);
|
||||
}
|
||||
|
||||
int readIndex = 0;
|
||||
long alignmentStartCounter = currentCounter.getCount();
|
||||
public void doneSubmittingReads() {
|
||||
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
|
||||
perSampleReads.getValue().signalEndOfInput();
|
||||
}
|
||||
}
|
||||
|
||||
int toPurge = elements.nextSetBit(0);
|
||||
int removedCount = 0;
|
||||
public Collection<SAMRecord> getReadsForSample(String sampleName) {
|
||||
if ( ! readsBySample.containsKey(sampleName) )
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
|
||||
while (readStateIterator.hasNext() && toPurge >= 0) {
|
||||
SAMRecordState state = readStateIterator.next();
|
||||
downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd());
|
||||
return readsBySample.get(sampleName).consumeFinalizedItems();
|
||||
}
|
||||
|
||||
if (readIndex == toPurge) {
|
||||
readStateIterator.remove();
|
||||
currentCounter.decrement();
|
||||
if (currentCounter.getCount() == 0)
|
||||
counterIterator.remove();
|
||||
removedCount++;
|
||||
toPurge = elements.nextSetBit(toPurge + 1);
|
||||
}
|
||||
|
||||
readIndex++;
|
||||
alignmentStartCounter--;
|
||||
if (alignmentStartCounter == 0 && counterIterator.hasNext()) {
|
||||
currentCounter = counterIterator.next();
|
||||
alignmentStartCounter = currentCounter.getCount();
|
||||
}
|
||||
}
|
||||
|
||||
totalReadStates -= removedCount;
|
||||
|
||||
return downsamplingExtent;
|
||||
public void reset() {
|
||||
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
|
||||
perSampleReads.getValue().clear();
|
||||
perSampleReads.getValue().reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: assuming that, whenever we downsample, we downsample to an integer capacity.
|
||||
*/
|
||||
static private class Counter {
|
||||
private int count;
|
||||
|
||||
public Counter(int count) {
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
public int getCount() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public void decrement() {
|
||||
count--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Selects reads passed to it based on a criteria decided through inheritance.
|
||||
* TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this.
|
||||
*/
|
||||
interface ReadSelector {
|
||||
/**
|
||||
* All previous selectors in the chain have allowed this read. Submit it to this selector for consideration.
|
||||
*
|
||||
* @param read the read to evaluate.
|
||||
*/
|
||||
public void submitRead(SAMRecord read);
|
||||
|
||||
/**
|
||||
* A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid.
|
||||
*
|
||||
* @param read the read previously rejected.
|
||||
*/
|
||||
public void notifyReadRejected(SAMRecord read);
|
||||
|
||||
/**
|
||||
* Signal the selector that read additions are complete.
|
||||
*/
|
||||
public void complete();
|
||||
|
||||
/**
|
||||
* Retrieve the number of reads seen by this selector so far.
|
||||
*
|
||||
* @return number of reads seen.
|
||||
*/
|
||||
public long getNumReadsSeen();
|
||||
|
||||
/**
|
||||
* Return the number of reads accepted by this selector so far.
|
||||
*
|
||||
* @return number of reads selected.
|
||||
*/
|
||||
public long getNumReadsSelected();
|
||||
|
||||
/**
|
||||
* Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the
|
||||
* last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at
|
||||
* position 3 whose cigar string is 76M, the value of this parameter will be 78.
|
||||
*
|
||||
* @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0.
|
||||
*/
|
||||
public int getDownsamplingExtent();
|
||||
|
||||
/**
|
||||
* Get the reads selected by this selector.
|
||||
*
|
||||
* @return collection of reads selected by this selector.
|
||||
*/
|
||||
public Collection<SAMRecord> getSelectedReads();
|
||||
|
||||
/**
|
||||
* Reset this collection to its pre-gathered state.
|
||||
*/
|
||||
public void reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Select every read passed in.
|
||||
*/
|
||||
class AllReadsSelector implements ReadSelector {
|
||||
private Collection<SAMRecord> reads = new LinkedList<SAMRecord>();
|
||||
private long readsSeen = 0;
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
reads.add(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
// NO-OP.
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reads;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
reads.clear();
|
||||
readsSeen = 0;
|
||||
downsamplingExtent = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Select N reads randomly from the input stream.
|
||||
*/
|
||||
class NRandomReadSelector implements ReadSelector {
|
||||
private final ReservoirDownsampler<SAMRecord> reservoir;
|
||||
private final ReadSelector chainedSelector;
|
||||
private long readsSeen = 0;
|
||||
private int downsamplingExtent = 0;
|
||||
|
||||
public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) {
|
||||
this.reservoir = new ReservoirDownsampler<SAMRecord>((int) readLimit);
|
||||
this.chainedSelector = chainedSelector;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
SAMRecord displaced = reservoir.add(read);
|
||||
if (displaced != null && chainedSelector != null) {
|
||||
chainedSelector.notifyReadRejected(read);
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
for (SAMRecord read : reservoir.getDownsampledContents())
|
||||
chainedSelector.submitRead(read);
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.complete();
|
||||
}
|
||||
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return reservoir.size();
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reservoir.getDownsampledContents();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
reservoir.clear();
|
||||
downsamplingExtent = 0;
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.reset();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
*/
|
||||
class SamplePartitioner implements ReadSelector {
|
||||
private final Map<String, ReadSelector> readsBySample;
|
||||
private long readsSeen = 0;
|
||||
|
||||
public SamplePartitioner(Map<String, ReadSelector> readSelectors) {
|
||||
readsBySample = readSelectors;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).submitRead(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).notifyReadRejected(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
// NO-OP.
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public long getNumReadsSelected() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
int downsamplingExtent = 0;
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent());
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner.");
|
||||
}
|
||||
|
||||
public ReadSelector getSelectedReads(String sampleName) {
|
||||
if (!readsBySample.containsKey(sampleName))
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
return readsBySample.get(sampleName);
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
storage.reset();
|
||||
readsSeen = 0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -41,7 +41,7 @@ abstract public class ReadTransformer {
|
|||
protected ReadTransformer() {}
|
||||
|
||||
/**
|
||||
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initialialSub routine.
|
||||
* Master initialization routine. Called to set up a ReadTransform, using its overloaded initializeSub routine.
|
||||
*
|
||||
* @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
|
||||
* @param engine the engine, for initializing values
|
||||
|
|
@ -59,7 +59,7 @@ abstract public class ReadTransformer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this to initialize themeselves
|
||||
* Subclasses must override this to initialize themselves
|
||||
*
|
||||
* @param engine the engine, for initializing values
|
||||
* @param walker the walker we intend to run
|
||||
|
|
|
|||
|
|
@ -271,7 +271,18 @@ public class GATKReport {
|
|||
* @return a simplified GATK report
|
||||
*/
|
||||
public static GATKReport newSimpleReport(final String tableName, final String... columns) {
|
||||
GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.length);
|
||||
return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #newSimpleReport(String, String...) but with a customized description
|
||||
* @param tableName
|
||||
* @param desc
|
||||
* @param columns
|
||||
* @return
|
||||
*/
|
||||
public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) {
|
||||
GATKReportTable table = new GATKReportTable(tableName, desc, columns.length);
|
||||
|
||||
for (String column : columns) {
|
||||
table.addColumn(column, "");
|
||||
|
|
@ -332,7 +343,7 @@ public class GATKReport {
|
|||
|
||||
GATKReportTable table = tables.firstEntry().getValue();
|
||||
if ( table.getNumColumns() != values.length )
|
||||
throw new ReviewedStingException("The number of arguments in writeRow() must match the number of columns in the table");
|
||||
throw new ReviewedStingException("The number of arguments in writeRow (" + values.length + ") must match the number of columns in the table (" + table.getNumColumns() + ")" );
|
||||
|
||||
final int rowIndex = table.getNumRows();
|
||||
for ( int i = 0; i < values.length; i++ )
|
||||
|
|
|
|||
|
|
@ -80,6 +80,9 @@ public enum GATKReportVersion {
|
|||
* @return The version as an enum.
|
||||
*/
|
||||
public static GATKReportVersion fromHeader(String header) {
|
||||
if ( header == null )
|
||||
throw new UserException.BadInput("The GATK report has no version specified in the header");
|
||||
|
||||
if (header.startsWith("##:GATKReport.v0.1 "))
|
||||
return GATKReportVersion.V0_1;
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
|||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
|
|
@ -46,99 +45,126 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
T sum) {
|
||||
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
|
||||
|
||||
final LocusView locusView = getLocusView( walker, dataProvider );
|
||||
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
|
||||
final LocusView locusView = new AllLocusView(dataProvider);
|
||||
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
|
||||
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
int minStart = Integer.MAX_VALUE;
|
||||
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
|
||||
int minStart = Integer.MAX_VALUE;
|
||||
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
|
||||
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
|
||||
|
||||
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
GenomeLoc prevLoc = null;
|
||||
while( locusView.hasNext() ) {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
GenomeLoc location = locus.getLocation();
|
||||
// We keep processing while the next reference location is within the interval
|
||||
GenomeLoc prevLoc = null;
|
||||
while( locusView.hasNext() ) {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
final GenomeLoc location = locus.getLocation();
|
||||
|
||||
if(prevLoc != null) {
|
||||
// fill in the active / inactive labels from the stop of the previous location to the start of this location
|
||||
// TODO refactor to separate function
|
||||
for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) {
|
||||
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
|
||||
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
|
||||
profile.add(fakeLoc, new ActivityProfileResult( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ));
|
||||
}
|
||||
}
|
||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||
// Note that this must occur before we skip locations that fall outside the engine intervals, because
|
||||
// reads may start outside our intervals yet still overlap them at a later locus
|
||||
// TODO -- this whole HashSet logic should be changed to a linked list of reads with
|
||||
// TODO -- subsequent pass over them to find the ones overlapping the active regions
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
|
||||
profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
|
||||
}
|
||||
|
||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
||||
// which active regions in the work queue are now safe to process
|
||||
minStart = Math.min(minStart, read.getAlignmentStart());
|
||||
}
|
||||
|
||||
prevLoc = location;
|
||||
|
||||
printProgress(locus.getLocation());
|
||||
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
||||
// which active regions in the work queue are now safe to process
|
||||
minStart = Math.min(minStart, read.getAlignmentStart());
|
||||
}
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
// skip this location -- it's not part of our engine intervals
|
||||
if ( outsideEngineIntervals(location) )
|
||||
continue;
|
||||
|
||||
// Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
// add these blocks of work to the work queue
|
||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
|
||||
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
|
||||
|
||||
// add active regions to queue of regions to process
|
||||
// first check if can merge active regions over shard boundaries
|
||||
if( !activeRegions.isEmpty() ) {
|
||||
if( !workQueue.isEmpty() ) {
|
||||
final ActiveRegion last = workQueue.getLast();
|
||||
final ActiveRegion first = activeRegions.get(0);
|
||||
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
|
||||
workQueue.removeLast();
|
||||
activeRegions.remove(first);
|
||||
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
|
||||
}
|
||||
}
|
||||
workQueue.addAll( activeRegions );
|
||||
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
|
||||
// we've moved across some interval boundary, so restart the profile
|
||||
profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
|
||||
}
|
||||
|
||||
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// now go and process all of the active regions
|
||||
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
|
||||
|
||||
prevLoc = location;
|
||||
|
||||
printProgress(locus.getLocation());
|
||||
}
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
|
||||
if ( ! profile.isEmpty() )
|
||||
incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
|
||||
|
||||
// add active regions to queue of regions to process
|
||||
// first check if can merge active regions over shard boundaries
|
||||
if( !activeRegions.isEmpty() ) {
|
||||
if( !workQueue.isEmpty() ) {
|
||||
final ActiveRegion last = workQueue.getLast();
|
||||
final ActiveRegion first = activeRegions.get(0);
|
||||
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
|
||||
workQueue.removeLast();
|
||||
activeRegions.remove(first);
|
||||
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
|
||||
}
|
||||
}
|
||||
workQueue.addAll( activeRegions );
|
||||
}
|
||||
|
||||
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
|
||||
// now go and process all of the active regions
|
||||
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the loc outside of the intervals being requested for processing by the GATK?
|
||||
* @param loc
|
||||
* @return
|
||||
*/
|
||||
private boolean outsideEngineIntervals(final GenomeLoc loc) {
|
||||
return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
* add these blocks of work to the work queue
|
||||
* band-pass filter the list of isActive probabilities and turn into active regions
|
||||
*
|
||||
* @param profile
|
||||
* @param activeRegions
|
||||
* @param activeRegionExtension
|
||||
* @param maxRegionSize
|
||||
* @return
|
||||
*/
|
||||
private ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
|
||||
final List<ActiveRegion> activeRegions,
|
||||
final int activeRegionExtension,
|
||||
final int maxRegionSize) {
|
||||
if ( profile.isEmpty() )
|
||||
throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
|
||||
|
||||
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
|
||||
activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
|
||||
return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -150,7 +176,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
final RefMetaDataTracker tracker, final ReferenceContext refContext,
|
||||
final AlignmentContext locus, final GenomeLoc location) {
|
||||
if ( walker.hasPresetActiveRegions() ) {
|
||||
return new ActivityProfileResult(walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
|
||||
return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
|
||||
} else {
|
||||
return walker.isActive( tracker, refContext, locus );
|
||||
}
|
||||
|
|
@ -232,13 +258,23 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
activeRegion.add( read );
|
||||
}
|
||||
for( final ActiveRegion otherRegionToTest : workQueue ) {
|
||||
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
|
||||
otherRegionToTest.add( read );
|
||||
if( !bestRegion.equals(otherRegionToTest) ) {
|
||||
// check for non-primary vs. extended
|
||||
if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) {
|
||||
otherRegionToTest.add( read );
|
||||
} else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
|
||||
otherRegionToTest.add( read );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
placedReads.add( read );
|
||||
} else if( activeRegion.getExtendedLoc().overlapsP( readLoc ) && walker.wantsNonPrimaryReads() ) {
|
||||
// check for non-primary vs. extended
|
||||
} else if( activeRegion.getLocation().overlapsP( readLoc ) ) {
|
||||
if ( walker.wantsNonPrimaryReads() ) {
|
||||
activeRegion.add( read );
|
||||
}
|
||||
} else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
|
||||
activeRegion.add( read );
|
||||
}
|
||||
}
|
||||
|
|
@ -250,30 +286,6 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
return walker.reduce( x, sum );
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// engine interaction code
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets the best view of loci for this walker given the available data.
|
||||
* @param walker walker to interrogate.
|
||||
* @param dataProvider Data which which to drive the locus view.
|
||||
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
||||
*/
|
||||
private LocusView getLocusView( final Walker<M,T> walker, final LocusShardDataProvider dataProvider ) {
|
||||
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
|
||||
if( dataSource == DataSource.READS )
|
||||
return new CoveredLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
|
||||
return new AllLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
|
||||
return new RodLocusView(dataProvider);
|
||||
else
|
||||
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
||||
}
|
||||
|
||||
/**
|
||||
* Special function called in LinearMicroScheduler to empty out the work queue.
|
||||
* Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
|
|
@ -13,14 +14,14 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Base class for all the Active Region Walkers.
|
||||
|
|
@ -70,11 +71,24 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
|
|||
return true; // We are keeping all the reads
|
||||
}
|
||||
|
||||
public boolean wantsNonPrimaryReads() {
|
||||
return false;
|
||||
public EnumSet<ActiveRegionReadState> desiredReadStates() {
|
||||
return EnumSet.of(ActiveRegionReadState.PRIMARY);
|
||||
}
|
||||
|
||||
public final boolean wantsNonPrimaryReads() {
|
||||
return desiredReadStates().contains(ActiveRegionReadState.NONPRIMARY);
|
||||
}
|
||||
|
||||
public boolean wantsExtendedReads() {
|
||||
return desiredReadStates().contains(ActiveRegionReadState.EXTENDED);
|
||||
}
|
||||
|
||||
public boolean wantsUnmappedReads() {
|
||||
return desiredReadStates().contains(ActiveRegionReadState.UNMAPPED);
|
||||
}
|
||||
|
||||
// Determine probability of active status over the AlignmentContext
|
||||
@Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"})
|
||||
public abstract ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
|
||||
|
||||
// Map over the ActiveRegion
|
||||
|
|
|
|||
|
|
@ -276,6 +276,12 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
for (PileupElement p : sample.getValue().getBasePileup()) {
|
||||
|
||||
// ignore reduced reads because they are always on the forward strand!
|
||||
// TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test
|
||||
if ( p.getRead().isReducedRead() )
|
||||
continue;
|
||||
|
||||
if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions
|
||||
continue;
|
||||
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
|
|||
final List<Double> refQuals, final List<Double> altQuals) {
|
||||
|
||||
if (pileup != null && likelihoodMap == null) {
|
||||
// no per-read likelihoods available:
|
||||
// old UG snp-only path through the annotations
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) ) {
|
||||
if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) {
|
||||
|
|
@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
|
|||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
// BUGBUG: There needs to be a comparable isUsableBase check here
|
||||
if (a.isNoCall())
|
||||
continue; // read is non-informative
|
||||
if (a.isReference())
|
||||
refQuals.add((double)el.getKey().getMappingQuality());
|
||||
else if (allAlleles.contains(a))
|
||||
altQuals.add((double)el.getKey().getMappingQuality());
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
ReadBackedPileup pileup = null;
|
||||
|
||||
|
||||
if (stratifiedContexts != null) {
|
||||
if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context != null )
|
||||
pileup = context.getBasePileup();
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
final List<Double> refQuals, final List<Double> altQuals) {
|
||||
|
||||
if (alleleLikelihoodMap == null) {
|
||||
// use fast SNP-based version if we don't have per-read allele likelihoods
|
||||
// use old UG SNP-based version if we don't have per-read allele likelihoods
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) ) {
|
||||
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ import java.util.*;
|
|||
@Allows(value={DataSource.READS, DataSource.REFERENCE})
|
||||
@Reference(window=@Window(start=-50,stop=50))
|
||||
@By(DataSource.REFERENCE)
|
||||
public class VariantAnnotator extends RodWalker<Integer, Integer> implements AnnotatorCompatible {
|
||||
public class VariantAnnotator extends RodWalker<Integer, Integer> implements AnnotatorCompatible, TreeReducible<Integer> {
|
||||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
|
@ -275,14 +275,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the number of loci processed to zero.
|
||||
*
|
||||
* @return 0
|
||||
*/
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
|
||||
/**
|
||||
* We want reads that span deletions
|
||||
*
|
||||
|
|
@ -323,15 +315,15 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the number of loci processed.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return the new number of loci processed.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) { return value + sum; }
|
||||
|
||||
@Override
|
||||
public Integer treeReduce(Integer lhs, Integer rhs) {
|
||||
return lhs + rhs;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -277,8 +277,12 @@ public class VariantAnnotatorEngine {
|
|||
if ( expression.fieldName.equals("ID") ) {
|
||||
if ( vc.hasID() )
|
||||
infoAnnotations.put(expression.fullName, vc.getID());
|
||||
} else if (expression.fieldName.equals("ALT")) {
|
||||
infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString());
|
||||
|
||||
} else if ( vc.hasAttribute(expression.fieldName) ) {
|
||||
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
|
||||
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -75,8 +75,9 @@ public class RecalibrationArgumentCollection {
|
|||
|
||||
/**
|
||||
* If not provided, then a temporary file is created and then deleted upon completion.
|
||||
* For advanced users only.
|
||||
*/
|
||||
@Hidden
|
||||
@Advanced
|
||||
@Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false)
|
||||
public File RECAL_CSV_FILE = null;
|
||||
|
||||
|
|
@ -101,13 +102,10 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
|
||||
public boolean DO_NOT_USE_STANDARD_COVARIATES = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
/**
|
||||
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
|
||||
*/
|
||||
@Hidden
|
||||
@Advanced
|
||||
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
public boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
|
|
@ -138,6 +136,13 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
|
||||
public int INDELS_CONTEXT_SIZE = 3;
|
||||
|
||||
/**
|
||||
* The cycle covariate will generate an error if it encounters a cycle greater than this value.
|
||||
* This argument is ignored if the Cycle covariate is not used.
|
||||
*/
|
||||
@Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
|
||||
public int MAXIMUM_CYCLE_VALUE = 500;
|
||||
|
||||
/**
|
||||
* A default base quality to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read with this default value. A negative value turns it off (default is off).
|
||||
*/
|
||||
|
|
@ -175,9 +180,15 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it")
|
||||
public String BINARY_TAG_NAME = null;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String DEFAULT_PLATFORM = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,73 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.SortedSet;
|
||||
|
||||
/**
|
||||
* GenomeLocs are very useful objects to keep track of genomic locations and perform set operations
|
||||
* with them.
|
||||
*
|
||||
* However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot
|
||||
* be created easily for small tasks that do not require the rigors of the GenomeLocParser validation
|
||||
*
|
||||
* SimpleGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should
|
||||
* only be used outside of the engine.
|
||||
*
|
||||
* User: carneiro
|
||||
* Date: 10/16/12
|
||||
* Time: 2:07 PM
|
||||
*/
|
||||
public class SimpleGenomeLoc extends GenomeLoc {
|
||||
private boolean finished;
|
||||
|
||||
public SimpleGenomeLoc(String contigName, int contigIndex, int start, int stop, boolean finished) {
|
||||
super(contigName, contigIndex, start, stop);
|
||||
this.finished = finished;
|
||||
}
|
||||
|
||||
public boolean isFinished() {
|
||||
return finished;
|
||||
}
|
||||
|
||||
@Requires("a != null && b != null")
|
||||
public static SimpleGenomeLoc merge(SimpleGenomeLoc a, SimpleGenomeLoc b) throws ReviewedStingException {
|
||||
if(GenomeLoc.isUnmapped(a) || GenomeLoc.isUnmapped(b)) {
|
||||
throw new ReviewedStingException("Tried to merge unmapped genome locs");
|
||||
}
|
||||
|
||||
if (!(a.contiguousP(b))) {
|
||||
throw new ReviewedStingException("The two genome locs need to be contiguous");
|
||||
}
|
||||
|
||||
|
||||
return new SimpleGenomeLoc(a.getContig(), a.contigIndex,
|
||||
Math.min(a.getStart(), b.getStart()),
|
||||
Math.max(a.getStop(), b.getStop()),
|
||||
a.isFinished());
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges a list of *sorted* *contiguous* locs into one
|
||||
*
|
||||
* @param sortedLocs a sorted list of contiguous locs
|
||||
* @return one merged loc
|
||||
*/
|
||||
public static SimpleGenomeLoc merge(SortedSet<SimpleGenomeLoc> sortedLocs) {
|
||||
SimpleGenomeLoc previousLoc = null;
|
||||
for (SimpleGenomeLoc loc : sortedLocs) {
|
||||
if (loc.isUnmapped()) {
|
||||
throw new ReviewedStingException("Tried to merge unmapped genome locs");
|
||||
}
|
||||
if (previousLoc != null && !previousLoc.contiguousP(loc)) {
|
||||
throw new ReviewedStingException("The genome locs need to be contiguous");
|
||||
}
|
||||
previousLoc = loc;
|
||||
}
|
||||
SimpleGenomeLoc firstLoc = sortedLocs.first();
|
||||
SimpleGenomeLoc lastLoc = sortedLocs.last();
|
||||
return merge(firstLoc, lastLoc);
|
||||
}
|
||||
}
|
||||
|
|
@ -191,7 +191,7 @@ public class CallableLoci extends LocusWalker<CallableLoci.CallableBaseState, Ca
|
|||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
|
||||
OutputFormat outputFormat;
|
||||
OutputFormat outputFormat = OutputFormat.BED;
|
||||
|
||||
public enum OutputFormat {
|
||||
/**
|
||||
|
|
@ -297,7 +297,7 @@ public class CallableLoci extends LocusWalker<CallableLoci.CallableBaseState, Ca
|
|||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), state);
|
||||
return String.format("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart()-1, loc.getStop(), state);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,11 +6,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
|
|
@ -20,6 +19,21 @@ import java.util.Map;
|
|||
*/
|
||||
public class CoverageUtils {
|
||||
|
||||
public enum CountPileupType {
    /**
     * Every read is counted on its own, including reads that come from the same fragment.
     */
    COUNT_READS,

    /**
     * Every fragment is counted, whether or not its constituent reads agree at the base.
     */
    COUNT_FRAGMENTS,

    /**
     * A fragment is counted only when its constituent reads agree at the base.
     */
    COUNT_FRAGMENTS_REQUIRE_SAME_BASE
}
|
||||
|
||||
/**
|
||||
* Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context
|
||||
* as an array of ints, indexed by the index fields of BaseUtils
|
||||
|
|
@ -64,10 +78,10 @@ public class CoverageUtils {
|
|||
}
|
||||
|
||||
public static Map<DoCOutputType.Partition,Map<String,int[]>>
|
||||
getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, Collection<DoCOutputType.Partition> types) {
|
||||
getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection<DoCOutputType.Partition> types) {
|
||||
|
||||
Map<DoCOutputType.Partition,Map<String,int[]>> countsByIDByType = new HashMap<DoCOutputType.Partition,Map<String,int[]>>();
|
||||
Map<SAMReadGroupRecord,int[]> countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ);
|
||||
Map<SAMReadGroupRecord,int[]> countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType);
|
||||
for (DoCOutputType.Partition t : types ) {
|
||||
// iterate through the read group counts and build the type associations
|
||||
for ( Map.Entry<SAMReadGroupRecord,int[]> readGroupCountEntry : countsByRG.entrySet() ) {
|
||||
|
|
@ -95,31 +109,95 @@ public class CoverageUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static Map<SAMReadGroupRecord,int[]> getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) {
|
||||
public static Map<SAMReadGroupRecord,int[]> getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) {
|
||||
Map<SAMReadGroupRecord, int[]> countsByRG = new HashMap<SAMReadGroupRecord,int[]>();
|
||||
for ( PileupElement e : context.getBasePileup() ) {
|
||||
if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ) ) {
|
||||
SAMReadGroupRecord readGroup = getReadGroup(e.getRead());
|
||||
if ( ! countsByRG.keySet().contains(readGroup) ) {
|
||||
countsByRG.put(readGroup,new int[6]);
|
||||
updateCounts(countsByRG.get(readGroup),e);
|
||||
} else {
|
||||
updateCounts(countsByRG.get(readGroup),e);
|
||||
|
||||
List<PileupElement> countPileup = new LinkedList<PileupElement>();
|
||||
FragmentCollection<PileupElement> fpile;
|
||||
|
||||
switch (countType) {
|
||||
|
||||
case COUNT_READS:
|
||||
for (PileupElement e : context.getBasePileup())
|
||||
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
|
||||
countPileup.add(e);
|
||||
break;
|
||||
|
||||
case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters:
|
||||
fpile = context.getBasePileup().getStartSortedPileup().toFragments();
|
||||
|
||||
for (PileupElement e : fpile.getSingletonReads())
|
||||
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
|
||||
countPileup.add(e);
|
||||
|
||||
for (List<PileupElement> overlappingPair : fpile.getOverlappingPairs()) {
|
||||
// iterate over all elements in fragment:
|
||||
for (PileupElement e : overlappingPair) {
|
||||
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) {
|
||||
countPileup.add(e); // add the first passing element per fragment
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case COUNT_FRAGMENTS_REQUIRE_SAME_BASE:
|
||||
fpile = context.getBasePileup().getStartSortedPileup().toFragments();
|
||||
|
||||
for (PileupElement e : fpile.getSingletonReads())
|
||||
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
|
||||
countPileup.add(e);
|
||||
|
||||
for (List<PileupElement> overlappingPair : fpile.getOverlappingPairs()) {
|
||||
PileupElement firstElem = null;
|
||||
PileupElement addElem = null;
|
||||
|
||||
// iterate over all elements in fragment:
|
||||
for (PileupElement e : overlappingPair) {
|
||||
if (firstElem == null)
|
||||
firstElem = e;
|
||||
else if (e.getBase() != firstElem.getBase()) {
|
||||
addElem = null;
|
||||
break;
|
||||
}
|
||||
|
||||
// will add the first passing element per base-consistent fragment:
|
||||
if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
|
||||
addElem = e;
|
||||
}
|
||||
|
||||
if (addElem != null)
|
||||
countPileup.add(addElem);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new UserException("Must use valid CountPileupType");
|
||||
}
|
||||
|
||||
for (PileupElement e : countPileup) {
|
||||
SAMReadGroupRecord readGroup = getReadGroup(e.getRead());
|
||||
if (!countsByRG.keySet().contains(readGroup))
|
||||
countsByRG.put(readGroup, new int[6]);
|
||||
|
||||
updateCounts(countsByRG.get(readGroup), e);
|
||||
}
|
||||
|
||||
return countsByRG;
|
||||
}
|
||||
|
||||
private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) {
|
||||
return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ));
|
||||
}
|
||||
|
||||
private static void updateCounts(int[] counts, PileupElement e) {
|
||||
if ( e.isDeletion() ) {
|
||||
counts[BaseUtils.DELETION_INDEX]++;
|
||||
counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount();
|
||||
} else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) {
|
||||
counts[BaseUtils.NO_CALL_INDEX]++;
|
||||
counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount();
|
||||
} else {
|
||||
try {
|
||||
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++;
|
||||
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount();
|
||||
} catch (ArrayIndexOutOfBoundsException exc) {
|
||||
throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -129,11 +129,15 @@ public class DepthOfCoverage extends LocusWalker<Map<DoCOutputType.Partition,Map
|
|||
int minMappingQuality = -1;
|
||||
@Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false)
|
||||
int maxMappingQuality = Integer.MAX_VALUE;
|
||||
|
||||
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to -1.", required = false)
|
||||
byte minBaseQuality = -1;
|
||||
@Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth. Defaults to 127 (Byte.MAX_VALUE).", required = false)
|
||||
byte maxBaseQuality = Byte.MAX_VALUE;
|
||||
|
||||
@Argument(fullName = "countType", doc = "How should overlapping reads from the same fragment be handled?", required = false)
|
||||
CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS;
|
||||
|
||||
/**
|
||||
* Instead of reporting depth, report the base pileup at each locus
|
||||
*/
|
||||
|
|
@ -373,7 +377,7 @@ public class DepthOfCoverage extends LocusWalker<Map<DoCOutputType.Partition,Map
|
|||
//System.out.printf("\t[log]\t%s",ref.getLocus());
|
||||
}
|
||||
|
||||
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
|
||||
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,countType,partitionTypes);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
|
|||
int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup());
|
||||
|
||||
// note the linear probability scale
|
||||
return new ActivityProfileResult(Math.min(depth / coverageThreshold, 1));
|
||||
return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1));
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,6 +47,12 @@ import java.util.List;
|
|||
* <p>
|
||||
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
|
||||
*
|
||||
* The output format can be partially controlled using the provided command-line arguments.
|
||||
* Specify intervals with the usual -L argument to output only the reference bases within your intervals.
|
||||
* Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
|
||||
* separate fasta sequence (named numerically in order).
|
||||
*
|
||||
* Several important notes:
|
||||
* 1) if there are multiple variants that start at a site, it chooses one of them randomly.
|
||||
* 2) when there are overlapping indels (but with different start positions) only the first will be chosen.
|
||||
|
|
|
|||
|
|
@ -59,8 +59,8 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
|||
public enum Model {
|
||||
SNP,
|
||||
INDEL,
|
||||
GeneralPloidySNP,
|
||||
GeneralPloidyINDEL,
|
||||
GENERALPLOIDYSNP,
|
||||
GENERALPLOIDYINDEL,
|
||||
BOTH
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ import java.util.*;
|
|||
|
||||
public class UnifiedGenotyperEngine {
|
||||
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
|
||||
private static final String GPSTRING = "GeneralPloidy";
|
||||
private static final String GPSTRING = "GENERALPLOIDY";
|
||||
|
||||
public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA";
|
||||
|
||||
|
|
@ -79,6 +79,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// the model used for calculating genotypes
|
||||
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
|
||||
private final List<GenotypeLikelihoodsCalculationModel.Model> modelsToUse = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(2);
|
||||
|
||||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AFCalc> afcm = new ThreadLocal<AFCalc>();
|
||||
|
|
@ -134,6 +135,8 @@ public class UnifiedGenotyperEngine {
|
|||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
|
||||
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
|
||||
determineGLModelsToUse();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -190,6 +193,10 @@ public class UnifiedGenotyperEngine {
|
|||
final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap);
|
||||
if ( vc != null )
|
||||
results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap));
|
||||
// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set
|
||||
// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES)
|
||||
// results.add(generateEmptyContext(tracker, refContext, null, rawContext));
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -282,7 +289,7 @@ public class UnifiedGenotyperEngine {
|
|||
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
|
||||
}
|
||||
|
||||
return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap);
|
||||
return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
|
||||
|
|
@ -630,48 +637,51 @@ public class UnifiedGenotyperEngine {
|
|||
(UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
|
||||
}
|
||||
|
||||
/**
 * Fills modelsToUse with the genotype likelihoods model(s) implied by
 * UAC.GLmodel and UAC.samplePloidy.
 *
 * A model whose name contains "BOTH" is expanded into two entries, SNP first and
 * INDEL second; any other model yields exactly one entry.
 */
private void determineGLModelsToUse() {
|
||||
|
||||
String modelPrefix = "";
|
||||
// use the general-ploidy variant when the requested model is not already one and the ploidy is non-default
if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY )
|
||||
modelPrefix = GPSTRING;
|
||||
|
||||
if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) {
|
||||
// keep whatever remains of the model name once "BOTH" is removed as part of the prefix
modelPrefix += UAC.GLmodel.name().toUpperCase().replaceAll("BOTH","");
|
||||
// insertion order matters: getGLModelsToUse reads index 0 as the SNP model and index 1 as the INDEL model
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP"));
|
||||
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL"));
|
||||
}
|
||||
else {
|
||||
// single model: look it up by its (possibly prefixed) upper-cased name
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase()));
|
||||
}
|
||||
}
|
||||
|
||||
// decide whether we are currently processing SNPs, indels, neither, or both
|
||||
private List<GenotypeLikelihoodsCalculationModel.Model> getGLModelsToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext refContext,
|
||||
final AlignmentContext rawContext) {
|
||||
|
||||
final List<GenotypeLikelihoodsCalculationModel.Model> models = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(2);
|
||||
String modelPrefix = "";
|
||||
if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") )
|
||||
modelPrefix = UAC.GLmodel.name().toUpperCase().replaceAll("BOTH","");
|
||||
if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES )
|
||||
return modelsToUse;
|
||||
|
||||
if (!UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
|
||||
modelPrefix = GPSTRING + modelPrefix;
|
||||
// if we're genotyping given alleles then we need to choose the model corresponding to the variant type requested
|
||||
final List<GenotypeLikelihoodsCalculationModel.Model> GGAmodel = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(1);
|
||||
final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if ( vcInput == null )
|
||||
return GGAmodel; // no work to be done
|
||||
|
||||
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP
|
||||
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if ( vcInput == null )
|
||||
return models;
|
||||
|
||||
if ( vcInput.isSNP() ) {
|
||||
// ignore SNPs if the user chose INDEL mode only
|
||||
if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") || UAC.GLmodel.name().toUpperCase().contains("SNP") )
|
||||
models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP"));
|
||||
}
|
||||
else if ( vcInput.isIndel() || vcInput.isMixed() ) {
|
||||
// ignore INDELs if the user chose SNP mode only
|
||||
if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") || UAC.GLmodel.name().toUpperCase().contains("INDEL") )
|
||||
models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL"));
|
||||
}
|
||||
// No support for other types yet
|
||||
if ( vcInput.isSNP() ) {
|
||||
// use the SNP model unless the user chose INDEL mode only
|
||||
if ( modelsToUse.size() == 2 || modelsToUse.get(0).name().endsWith("SNP") )
|
||||
GGAmodel.add(modelsToUse.get(0));
|
||||
}
|
||||
else {
|
||||
if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) {
|
||||
models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP"));
|
||||
models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL"));
|
||||
}
|
||||
else {
|
||||
models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase()));
|
||||
}
|
||||
else if ( vcInput.isIndel() || vcInput.isMixed() ) {
|
||||
// use the INDEL model unless the user chose SNP mode only
|
||||
if ( modelsToUse.size() == 2 )
|
||||
GGAmodel.add(modelsToUse.get(1));
|
||||
else if ( modelsToUse.get(0).name().endsWith("INDEL") )
|
||||
GGAmodel.add(modelsToUse.get(0));
|
||||
}
|
||||
// No support for other types yet
|
||||
|
||||
return models;
|
||||
return GGAmodel;
|
||||
}
|
||||
|
||||
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
|
||||
|
|
|
|||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
|||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
|
|
@ -52,7 +51,7 @@ public class AFCalcResult {
|
|||
private final double[] log10PriorsOfAC;
|
||||
private final double[] log10PosteriorsOfAC;
|
||||
|
||||
private final Map<Allele, Double> log10pNonRefByAllele;
|
||||
private final Map<Allele, Double> log10pRefByAllele;
|
||||
|
||||
/**
|
||||
* The AC values for all ALT alleles at the MLE
|
||||
|
|
@ -74,16 +73,16 @@ public class AFCalcResult {
|
|||
final List<Allele> allelesUsedInGenotyping,
|
||||
final double[] log10LikelihoodsOfAC,
|
||||
final double[] log10PriorsOfAC,
|
||||
final Map<Allele, Double> log10pNonRefByAllele) {
|
||||
final Map<Allele, Double> log10pRefByAllele) {
|
||||
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping);
|
||||
if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null");
|
||||
if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size());
|
||||
if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations);
|
||||
if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2");
|
||||
if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2");
|
||||
if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null");
|
||||
if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
|
||||
if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
|
||||
if ( log10pRefByAllele == null ) throw new IllegalArgumentException("log10pRefByAllele cannot be null");
|
||||
if ( log10pRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pRefByAllele has the wrong number of elements: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
|
||||
if ( ! allelesUsedInGenotyping.containsAll(log10pRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pRefByAllele doesn't contain all of the alleles used in genotyping: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
|
||||
if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
|
||||
if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
|
||||
|
||||
|
|
@ -94,7 +93,7 @@ public class AFCalcResult {
|
|||
this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES);
|
||||
this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES);
|
||||
this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC);
|
||||
this.log10pNonRefByAllele = new HashMap<Allele, Double>(log10pNonRefByAllele);
|
||||
this.log10pRefByAllele = new HashMap<Allele, Double>(log10pRefByAllele);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -104,7 +103,7 @@ public class AFCalcResult {
|
|||
* @return
|
||||
*/
|
||||
public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) {
|
||||
return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele);
|
||||
return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -219,7 +218,7 @@ public class AFCalcResult {
|
|||
public String toString() {
|
||||
final List<String> byAllele = new LinkedList<String>();
|
||||
for ( final Allele a : getAllelesUsedInGenotyping() )
|
||||
if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a)));
|
||||
if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFEq0ForAllele(a)));
|
||||
return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele));
|
||||
}
|
||||
|
||||
|
|
@ -231,13 +230,16 @@ public class AFCalcResult {
|
|||
* And that log10minPNonRef is -3.
|
||||
* We are considered polymorphic since 10^-5 < 10^-3 => -5 < -3
|
||||
*
|
||||
* Note that log10minPNonRef is really the minimum confidence, scaled as an error rate, so
|
||||
* if you want to be 99% confident, then log10minPNonRef should be log10(0.01) = -2.
|
||||
*
|
||||
* @param log10minPNonRef the log10 scaled min pr of being non-ref to be considered polymorphic
|
||||
*
|
||||
* @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0
|
||||
*/
|
||||
@Requires("MathUtils.goodLog10Probability(log10minPNonRef)")
|
||||
public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) {
|
||||
return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef;
|
||||
return getLog10PosteriorOfAFEq0ForAllele(allele) < log10minPNonRef;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -245,7 +247,7 @@ public class AFCalcResult {
|
|||
*/
|
||||
public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) {
|
||||
if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 ");
|
||||
final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual));
|
||||
final double log10Threshold = minPNonRefPhredScaledQual / -10;
|
||||
return isPolymorphic(allele, log10Threshold);
|
||||
}
|
||||
|
||||
|
|
@ -263,7 +265,16 @@ public class AFCalcResult {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the log10 probability that allele is segregating
|
||||
* Returns the log10 probability that allele is not segregating
|
||||
*
|
||||
* Note that this function returns p(not segregating) so that we can store
|
||||
* internally the log10 value of AF == 0, which grows very quickly
|
||||
* negative and yet has sufficient resolution for high confidence tests.
|
||||
* For example, if log10pRef == -100, not an unreasonably high number,
|
||||
* if we tried to store log10pNonRef we'd be looking at 1 - 10^-100, which
|
||||
* simply rounds to 1 in floating point. So the logic here is backward from what
|
||||
* you really want (the p of segregating) but we do that for numerical
|
||||
* reasons
|
||||
*
|
||||
* Unlike the sites-level annotation, this calculation is specific to allele, and can be
|
||||
* used to separately determine how much evidence there is that allele is independently
|
||||
|
|
@ -272,11 +283,11 @@ public class AFCalcResult {
|
|||
* evidence for one allele but not so much for any other allele
|
||||
*
|
||||
* @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping
|
||||
* @return the log10 probability that allele is segregating at this site
|
||||
* @return the log10 probability that allele is not segregating at this site
|
||||
*/
|
||||
@Ensures("MathUtils.goodLog10Probability(result)")
|
||||
public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) {
|
||||
final Double log10pNonRef = log10pNonRefByAllele.get(allele);
|
||||
public double getLog10PosteriorOfAFEq0ForAllele(final Allele allele) {
|
||||
final Double log10pNonRef = log10pRefByAllele.get(allele);
|
||||
if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele);
|
||||
return log10pNonRef;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ public class ExactCallLogger implements Cloneable {
|
|||
for ( final Allele allele : result.getAllelesUsedInGenotyping() ) {
|
||||
if ( allele.isNonReference() ) {
|
||||
printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele));
|
||||
printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele));
|
||||
printCallElement(vc, "pRefByAllele", allele, result.getLog10PosteriorOfAFEq0ForAllele(allele));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -123,7 +123,7 @@ public class ExactCallLogger implements Cloneable {
|
|||
final double[] posteriors = new double[2];
|
||||
final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true);
|
||||
final List<Integer> mle = new ArrayList<Integer>();
|
||||
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>();
|
||||
final Map<Allele, Double> log10pRefByAllele = new HashMap<Allele, Double>();
|
||||
long runtimeNano = -1;
|
||||
|
||||
GenomeLoc currentLoc = null;
|
||||
|
|
@ -148,7 +148,7 @@ public class ExactCallLogger implements Cloneable {
|
|||
builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop);
|
||||
builder.genotypes(genotypes);
|
||||
final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{}));
|
||||
final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele);
|
||||
final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pRefByAllele);
|
||||
calls.add(new ExactCall(builder.make(), runtimeNano, result));
|
||||
}
|
||||
break;
|
||||
|
|
@ -165,9 +165,9 @@ public class ExactCallLogger implements Cloneable {
|
|||
posteriors[1] = Double.valueOf(value);
|
||||
} else if (variable.equals("MLE")) {
|
||||
mle.add(Integer.valueOf(value));
|
||||
} else if (variable.equals("pNonRefByAllele")) {
|
||||
} else if (variable.equals("pRefByAllele")) {
|
||||
final Allele a = Allele.create(key);
|
||||
log10pNonRefByAllele.put(a, Double.valueOf(value));
|
||||
log10pRefByAllele.put(a, Double.valueOf(value));
|
||||
} else if (variable.equals("runtime.nano")) {
|
||||
runtimeNano = Long.valueOf(value);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -125,8 +125,8 @@ import java.util.*;
|
|||
*/
|
||||
final List<AFCalcResult> supporting;
|
||||
|
||||
private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List<Allele> allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map<Allele, Double> log10pNonRefByAllele, List<AFCalcResult> supporting) {
|
||||
super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele);
|
||||
private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List<Allele> allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map<Allele, Double> log10pRefByAllele, List<AFCalcResult> supporting) {
|
||||
super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele);
|
||||
this.supporting = supporting;
|
||||
}
|
||||
}
|
||||
|
|
@ -323,7 +323,7 @@ import java.util.*;
|
|||
final int nAltAlleles = sortedResultsWithThetaNPriors.size();
|
||||
final int[] alleleCountsOfMLE = new int[nAltAlleles];
|
||||
final double[] log10PriorsOfAC = new double[2];
|
||||
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(nAltAlleles);
|
||||
final Map<Allele, Double> log10pRefByAllele = new HashMap<Allele, Double>(nAltAlleles);
|
||||
|
||||
// the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs
|
||||
double log10PosteriorOfACEq0Sum = 0.0;
|
||||
|
|
@ -348,7 +348,7 @@ import java.util.*;
|
|||
log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0();
|
||||
|
||||
// bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
|
||||
log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0());
|
||||
log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0());
|
||||
|
||||
// trivial -- update the number of evaluations
|
||||
nEvaluations += sortedResultWithThetaNPriors.nEvaluations;
|
||||
|
|
@ -384,6 +384,6 @@ import java.util.*;
|
|||
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true),
|
||||
// priors incorporate multiple alt alleles, must be normalized
|
||||
MathUtils.normalizeFromLog10(log10PriorsOfAC, true),
|
||||
log10pNonRefByAllele, sortedResultsWithThetaNPriors);
|
||||
log10pRefByAllele, sortedResultsWithThetaNPriors);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,13 +30,13 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
|
|||
final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)};
|
||||
final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors);
|
||||
|
||||
final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO;
|
||||
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef);
|
||||
final double log10PRef = log10Posteriors[1] > log10Posteriors[0] ? MathUtils.LOG10_P_OF_ZERO : 0.0;
|
||||
final Map<Allele, Double> log10pRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PRef);
|
||||
|
||||
return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(),
|
||||
MathUtils.normalizeFromLog10(log10Likelihoods, true),
|
||||
MathUtils.normalizeFromLog10(log10Priors, true),
|
||||
log10pNonRefByAllele);
|
||||
log10pRefByAllele);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -165,14 +165,14 @@ final class StateTracker {
|
|||
final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true);
|
||||
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
|
||||
|
||||
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
|
||||
final Map<Allele, Double> log10pRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
|
||||
for ( int i = 0; i < subACOfMLE.length; i++ ) {
|
||||
final Allele allele = allelesUsedInGenotyping.get(i+1);
|
||||
final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
|
||||
log10pNonRefByAllele.put(allele, log10PNonRef);
|
||||
final double log10PRef = alleleCountsOfMAP[i] > 0 ? -10000 : 0; // TODO -- a total hack but in effect what the old behavior was
|
||||
log10pRefByAllele.put(allele, log10PRef);
|
||||
}
|
||||
|
||||
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
|
||||
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pRefByAllele);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
|
|
@ -236,6 +236,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
* then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the
|
||||
* value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output
|
||||
* file name (2nd column) for each input file name (1st column).
|
||||
*
|
||||
* Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing).
|
||||
*/
|
||||
@Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file")
|
||||
protected String N_WAY_OUT = null;
|
||||
|
|
@ -274,7 +276,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
protected String OUT_SNPS = null;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
private IndexedFastaSequenceFile referenceReader;
|
||||
private CachingIndexedFastaSequenceFile referenceReader;
|
||||
|
||||
// the intervals input by the user
|
||||
private Iterator<GenomeLoc> intervals = null;
|
||||
|
|
@ -1601,7 +1603,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
public List<GATKSAMRecord> getReads() { return reads; }
|
||||
|
||||
public byte[] getReference(IndexedFastaSequenceFile referenceReader) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getReference(CachingIndexedFastaSequenceFile referenceReader) {
|
||||
// set up the reference if we haven't done so yet
|
||||
if ( reference == null ) {
|
||||
// first, pad the reference to handle deletions in narrow windows (e.g. those with only 1 read)
|
||||
|
|
@ -1609,7 +1612,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
int padRight = Math.min(loc.getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(loc.getContig()).getSequenceLength());
|
||||
loc = getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(), padLeft, padRight);
|
||||
reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
|
||||
StringUtil.toUpperCase(reference);
|
||||
}
|
||||
|
||||
return reference;
|
||||
|
|
|
|||
|
|
@ -287,6 +287,9 @@ public class PairHMMIndelErrorModel {
|
|||
if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) {
|
||||
startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
|
||||
}
|
||||
else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) {
|
||||
startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely;
|
||||
}
|
||||
|
||||
if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
|
||||
stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
|
||||
|
|
@ -329,7 +332,6 @@ public class PairHMMIndelErrorModel {
|
|||
getContextHomopolymerLength(readBases,hrunProfile);
|
||||
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
|
||||
|
||||
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
|
||||
Haplotype haplotype = haplotypeMap.get(a);
|
||||
|
|
@ -339,6 +341,8 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
|
||||
startLocationInRefForHaplotypes = haplotype.getStartPosition();
|
||||
else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
|
||||
startLocationInRefForHaplotypes = haplotype.getStopPosition();
|
||||
|
||||
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
|
||||
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
|
||||
|
|
@ -348,8 +352,6 @@ public class PairHMMIndelErrorModel {
|
|||
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
|
||||
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString());
|
||||
|
||||
|
||||
|
||||
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
|
||||
(int)indStart, (int)indStop);
|
||||
|
||||
|
|
|
|||
|
|
@ -811,9 +811,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
|
||||
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
|
||||
vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(),
|
||||
phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),Arrays.asList(phasedMother.getAD()),
|
||||
phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),Arrays.asList(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
|
||||
phasedChild.getGenotypeString(),Arrays.asList(phasedChild.getDP()),phasedChild.getAD(),phasedChild.getLikelihoodsString());
|
||||
phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),
|
||||
phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
|
||||
phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString());
|
||||
if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
||||
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
|
||||
}
|
||||
|
|
@ -823,8 +823,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
|
||||
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s:%s:%s:%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s",
|
||||
vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(),
|
||||
phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),Arrays.asList(phasedMother.getAD()),phasedMother.getLikelihoodsString(),
|
||||
phasedChild.getGenotypeString(),phasedChild.getDP(),Arrays.asList(phasedChild.getAD()),phasedChild.getLikelihoodsString());
|
||||
phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),phasedMother.getLikelihoodsString(),
|
||||
phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString());
|
||||
}
|
||||
}
|
||||
else{
|
||||
|
|
@ -834,8 +834,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
|
||||
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
|
||||
vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(),
|
||||
phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),Arrays.asList(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
|
||||
phasedChild.getGenotypeString(),phasedChild.getDP(),Arrays.asList(phasedChild.getAD()),phasedChild.getLikelihoodsString());
|
||||
phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
|
||||
phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString());
|
||||
}
|
||||
|
||||
//Report violation if set so
|
||||
|
|
@ -850,6 +850,18 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
return metricsCounters;
|
||||
}
|
||||
|
||||
private static String printAD(final int[] AD) {
|
||||
if ( AD == null || AD.length == 0 )
|
||||
return ".";
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
sb.append(AD[0]);
|
||||
for ( int i = 1; i < AD.length; i++) {
|
||||
sb.append(",");
|
||||
sb.append(AD[i]);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the reporting counters.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -95,7 +95,8 @@ import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFr
|
|||
|
||||
@DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} )
|
||||
public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingStats> {
|
||||
private static final boolean DEBUG = false;
|
||||
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information (if -l DEBUG is also specified)", required = false)
|
||||
protected boolean DEBUG = false;
|
||||
/**
|
||||
* The VCF file we are phasing variants from.
|
||||
*
|
||||
|
|
@ -134,6 +135,9 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
@Argument(fullName = "permitNoSampleOverlap", shortName = "permitNoSampleOverlap", doc = "Don't exit (just WARN) when the VCF and BAMs do not overlap in samples", required = false)
|
||||
private boolean permitNoSampleOverlap = false;
|
||||
|
||||
/**
|
||||
* Important note: do not use this argument if your input data set is not already phased or it will cause the tool to skip over all heterozygous sites.
|
||||
*/
|
||||
@Argument(fullName = "respectPhaseInInput", shortName = "respectPhaseInInput", doc = "Will only phase genotypes in cases where the resulting output will necessarily be consistent with any existing phase (for example, from trios)", required = false)
|
||||
private boolean respectPhaseInInput = false;
|
||||
|
||||
|
|
@ -949,7 +953,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
}
|
||||
|
||||
if (DEBUG) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
|
||||
MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, true);
|
||||
MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, DEBUG);
|
||||
double posteriorProb = maxHapQual.maxEntry.getScore().getValue();
|
||||
|
||||
if (DEBUG)
|
||||
|
|
@ -971,7 +975,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
public MaxHaplotypeAndQuality(PhasingTable hapTable, boolean printDebug) {
|
||||
// Marginalize each haplotype to its first 2 positions:
|
||||
hapTable = HaplotypeTableCreator.marginalizeAsNewTable(hapTable);
|
||||
if (DEBUG && printDebug)
|
||||
if (printDebug)
|
||||
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + "\n");
|
||||
|
||||
calculateMaxHapAndPhasingQuality(hapTable, printDebug);
|
||||
|
|
@ -981,7 +985,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
|
||||
private void calculateMaxHapAndPhasingQuality(PhasingTable hapTable, boolean printDebug) {
|
||||
hapTable.normalizeScores();
|
||||
if (DEBUG && printDebug)
|
||||
if (printDebug)
|
||||
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n");
|
||||
|
||||
// Determine the phase at this position:
|
||||
|
|
|
|||
|
|
@ -1,149 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Emits intervals present in either the original or reduced bam but not the other.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The original and reduced BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A list of intervals present in one bam but not the other.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -I:original original.bam \
|
||||
* -I:reduced reduced.bam \
|
||||
* -R ref.fasta \
|
||||
* -T AssessReducedCoverage \
|
||||
* -o output.intervals
|
||||
* </pre>
|
||||
*
|
||||
* @author ebanks
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
|
||||
@Hidden
|
||||
public class AssessReducedCoverage extends LocusWalker<GenomeLoc, GenomeLoc> implements TreeReducible<GenomeLoc> {
|
||||
|
||||
private static final String original = "original";
|
||||
private static final String reduced = "reduced";
|
||||
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
@Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false)
|
||||
public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false;
|
||||
|
||||
public void initialize() {}
|
||||
|
||||
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
Set<String> tags = getAllTags(context.getBasePileup());
|
||||
return (tags.contains(original) && !tags.contains(reduced)) ||
|
||||
(OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? ref.getLocus() : null;
|
||||
}
|
||||
|
||||
private Set<String> getAllTags(final ReadBackedPileup pileup) {
|
||||
|
||||
final Set<String> tags = new HashSet<String>(10);
|
||||
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() )
|
||||
tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags());
|
||||
}
|
||||
|
||||
return tags;
|
||||
}
|
||||
|
||||
public void onTraversalDone(GenomeLoc sum) {
|
||||
if ( sum != null )
|
||||
out.println(sum);
|
||||
}
|
||||
|
||||
public GenomeLoc reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) {
|
||||
if ( lhs == null )
|
||||
return rhs;
|
||||
|
||||
if ( rhs == null )
|
||||
return lhs;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( lhs.contiguousP(rhs) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop());
|
||||
|
||||
// otherwise, print the lhs and start over with the rhs
|
||||
out.println(lhs);
|
||||
return rhs;
|
||||
}
|
||||
|
||||
public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
if ( sum == null )
|
||||
return value;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( sum.contiguousP(value) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop());
|
||||
|
||||
// otherwise, print the sum and start over with the value
|
||||
out.println(sum);
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,173 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of
|
||||
* the reduced bam are above sufficient threshold)
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The original and reduced BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A list of intervals in which the differences between the original and reduced bam quals are bigger epsilon.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -I:original original.bam \
|
||||
* -I:reduced reduced.bam \
|
||||
* -R ref.fasta \
|
||||
* -T AssessReducedQuals \
|
||||
* -o output.intervals
|
||||
* </pre>
|
||||
*
|
||||
* @author ami
|
||||
*/
|
||||
|
||||
public class AssessReducedQuals extends LocusWalker<GenomeLoc, GenomeLoc> implements TreeReducible<GenomeLoc> {
|
||||
|
||||
private static final String reduced = "reduced";
|
||||
private static final String original = "original";
|
||||
private static final int originalQualsIndex = 0;
|
||||
private static final int reducedQualsIndex = 1;
|
||||
|
||||
@Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false)
|
||||
public int sufficientQualSum = 600;
|
||||
|
||||
@Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > epsilon*Quals_original_bam we output this interval", required = false)
|
||||
public int qual_epsilon = 0;
|
||||
|
||||
@Argument(fullName = "debugLevel", shortName = "debug", doc = "debug mode on") // TODO -- best to make this optional
|
||||
public int debugLevel = 0; // TODO -- best to make this an enum or boolean
|
||||
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
public void initialize() {
|
||||
if (debugLevel != 0)
|
||||
out.println(" Debug mode" +
|
||||
"Debug:\tsufficientQualSum: "+sufficientQualSum+ "\n " +
|
||||
"Debug:\tqual_epsilon: "+qual_epsilon);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
@Override
|
||||
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
boolean reportLocus;
|
||||
final int[] quals = getPileupQuals(context.getBasePileup());
|
||||
if (debugLevel != 0)
|
||||
out.println("Debug:\tLocus Quals\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]);
|
||||
final int epsilon = MathUtils.fastRound(quals[originalQualsIndex]*qual_epsilon);
|
||||
final int calcOriginalQuals = Math.min(quals[originalQualsIndex],sufficientQualSum);
|
||||
final int calcReducedQuals = Math.min(quals[reducedQualsIndex],sufficientQualSum);
|
||||
final int OriginalReducedQualDiff = calcOriginalQuals - calcReducedQuals;
|
||||
reportLocus = OriginalReducedQualDiff > epsilon || OriginalReducedQualDiff < -1*epsilon;
|
||||
if(debugLevel != 0 && reportLocus)
|
||||
out.println("Debug:\tEmited Locus\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]+"\tepsilon\t"+epsilon+"\tdiff\t"+OriginalReducedQualDiff);
|
||||
|
||||
return reportLocus ? ref.getLocus() : null;
|
||||
}
|
||||
|
||||
private final int[] getPileupQuals(final ReadBackedPileup readPileup) {
|
||||
|
||||
final int[] quals = new int[2];
|
||||
String[] printPileup = {"Debug 2:\toriginal pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n",
|
||||
"Debug 2:\t reduced pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n"};
|
||||
|
||||
for( PileupElement p : readPileup ){
|
||||
final List<String> tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags();
|
||||
if ( isGoodRead(p,tags) ){
|
||||
final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount();
|
||||
final int tagIndex = getTagIndex(tags);
|
||||
quals[tagIndex] += tempQual;
|
||||
if(debugLevel == 2)
|
||||
printPileup[tagIndex] += "\tDebug 2: ("+p+")\tMQ="+p.getMappingQual()+":QU="+p.getQual()+":RC="+p.getRepresentativeCount()+":OS="+p.getOffset()+"\n";
|
||||
}
|
||||
}
|
||||
if(debugLevel == 2){
|
||||
out.println(printPileup[originalQualsIndex]);
|
||||
out.println(printPileup[reducedQualsIndex]);
|
||||
}
|
||||
return quals;
|
||||
}
|
||||
|
||||
// TODO -- arguments/variables should be final, not method declaration
|
||||
private final boolean isGoodRead(PileupElement p, List<String> tags){
|
||||
// TODO -- this isn't quite right. You don't need the tags here. Instead, you want to check whether the read itself (which
|
||||
// TODO -- you can get from the PileupElement) is a reduced read (not all reads from the reduced bam are reduced) and only
|
||||
// TODO -- for them do you want to ignore that min mapping quality cutoff (but you *do* still want the min base cutoff).
|
||||
return !p.isDeletion() && (tags.contains(reduced) || (tags.contains(original) && (int)p.getQual() >= 20 && p.getMappingQual() >= 20));
|
||||
}
|
||||
|
||||
private final int getTagIndex(List<String> tags){
|
||||
return tags.contains(reduced) ? 1 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(GenomeLoc sum) {
|
||||
if ( sum != null )
|
||||
out.println(sum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) {
|
||||
if ( lhs == null )
|
||||
return rhs;
|
||||
|
||||
if ( rhs == null )
|
||||
return lhs;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( lhs.contiguousP(rhs) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop());
|
||||
|
||||
// otherwise, print the lhs and start over with the rhs
|
||||
out.println(lhs);
|
||||
return rhs;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
if ( sum == null )
|
||||
return value;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( sum.contiguousP(value) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop());
|
||||
|
||||
// otherwise, print the sum and start over with the value
|
||||
out.println(sum);
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
|
@ -183,6 +183,10 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
|
|||
@Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
|
||||
private boolean keepSitesWithAC0 = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="numSamples", shortName="numSamples", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
|
||||
private int numSamplesFromArgument = 0;
|
||||
|
||||
/**
|
||||
* If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
|
||||
* variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc.
|
||||
|
|
@ -589,6 +593,14 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
|
|||
public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; }
|
||||
public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }
|
||||
|
||||
public int getNumberOfSamplesForEvaluation() {
|
||||
if (sampleNamesForEvaluation!= null && !sampleNamesForEvaluation.isEmpty())
|
||||
return sampleNamesForEvaluation.size();
|
||||
else {
|
||||
return numSamplesFromArgument;
|
||||
}
|
||||
|
||||
}
|
||||
public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; }
|
||||
|
||||
public List<RodBinding<VariantContext>> getComps() { return comps; }
|
||||
|
|
|
|||
|
|
@ -1,249 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Empty placeholder for the former "Variant Quality Score" analysis, which was described as showing
 * various stats of sets of variants binned by variant quality score.
 *
 * The class declares no members and does not extend VariantEvaluator; the previous implementation had
 * been entirely commented out and has been dropped here (it remains available in version control).
 *
 * @author rpoplin
 * @since Apr 6, 2010
 * @deprecated the evaluator implementation is disabled; this class has no functionality.
 */
@Deprecated
public class VariantQualityScore {
    // TODO - this should really be a stratification
}
|
||||
|
|
@ -29,7 +29,7 @@ public class AlleleCount extends VariantStratifier {
|
|||
|
||||
// There are ploidy x n sample chromosomes
|
||||
// TODO -- generalize to handle multiple ploidy
|
||||
nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy();
|
||||
nchrom = getVariantEvalWalker().getNumberOfSamplesForEvaluation() * getVariantEvalWalker().getSamplePloidy();
|
||||
if ( nchrom < 2 )
|
||||
throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample");
|
||||
|
||||
|
|
|
|||
|
|
@ -81,7 +81,7 @@ public class VariantDataManager {
|
|||
final double theSTD = standardDeviation(theMean, iii);
|
||||
logger.info( annotationKeys.get(iii) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
|
||||
if( Double.isNaN(theMean) ) {
|
||||
throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpUtils.GATK_FORUM_URL + "discussion/49/using-variant-annotator");
|
||||
throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpUtils.forumPost("discussion/49/using-variant-annotator"));
|
||||
}
|
||||
|
||||
foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6);
|
||||
|
|
|
|||
|
|
@ -134,7 +134,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
|
|||
protected VariantContextWriter vcfWriter = null;
|
||||
|
||||
@Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false)
|
||||
public VariantContextUtils.GenotypeMergeType genotypeMergeOption = VariantContextUtils.GenotypeMergeType.PRIORITIZE;
|
||||
public VariantContextUtils.GenotypeMergeType genotypeMergeOption = null;
|
||||
|
||||
@Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false)
|
||||
public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED;
|
||||
|
|
@ -200,13 +200,13 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
|
|||
} else
|
||||
logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
|
||||
|
||||
if ( PRIORITY_STRING == null ) {
|
||||
validateAnnotateUnionArguments();
|
||||
if ( PRIORITY_STRING == null && genotypeMergeOption == null) {
|
||||
genotypeMergeOption = VariantContextUtils.GenotypeMergeType.UNSORTED;
|
||||
//PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12)
|
||||
logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING);
|
||||
logger.info("Priority string not provided, using arbitrary genotyping order: "+priority);
|
||||
}
|
||||
|
||||
validateAnnotateUnionArguments();
|
||||
samples = sitesOnlyVCF ? Collections.<String>emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
|
||||
|
||||
if ( SET_KEY.toLowerCase().equals("null") )
|
||||
|
|
@ -228,22 +228,22 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
|
|||
if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null )
|
||||
throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes");
|
||||
|
||||
if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE )
|
||||
if ( PRIORITY_STRING != null){
|
||||
priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
|
||||
else
|
||||
priority = new ArrayList<String>(rodNames);
|
||||
if ( rodNames.size() != priority.size() )
|
||||
throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority);
|
||||
|
||||
if ( rodNames.size() != priority.size() )
|
||||
throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority);
|
||||
if ( ! rodNames.containsAll(priority) )
|
||||
throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING);
|
||||
}
|
||||
|
||||
if ( ! rodNames.containsAll(priority) )
|
||||
throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return 0;
|
||||
|
||||
Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);
|
||||
// get all of the vcf rods at this locus
|
||||
// Need to provide reference bases to simpleMerge starting at current locus
|
||||
Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
|
||||
|
|
@ -290,13 +290,13 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
|
|||
for (VariantContext.Type type : VariantContext.Type.values()) {
|
||||
if (VCsByType.containsKey(type))
|
||||
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
|
||||
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
priority, rodNames.size() , filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
|
||||
}
|
||||
}
|
||||
else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
|
||||
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs,
|
||||
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
|
||||
}
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -151,14 +151,6 @@ import java.util.*;
|
|||
* -mvq 50 \
|
||||
* -o violations.vcf
|
||||
*
|
||||
* Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf \
|
||||
* -number 1000
|
||||
*
|
||||
* Creating a set with 50% of the total number of variants in the variant VCF:
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
|
|
|
|||
|
|
@ -210,13 +210,23 @@ public class JnaSession implements Session {
|
|||
}
|
||||
|
||||
public static void setAttribute(Pointer jt, String name, String value) throws DrmaaException {
|
||||
checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
|
||||
if (getAttrNames().contains(name)) {
|
||||
checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
|
||||
}
|
||||
else {
|
||||
throw new InvalidAttributeValueException("Attribute " + name + " is not supported by this implementation of DRMAA");
|
||||
}
|
||||
}
|
||||
|
||||
public static String getAttribute(Pointer jt, String name) throws DrmaaException {
|
||||
Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER);
|
||||
checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
|
||||
return attrBuffer.getString(0);
|
||||
if (getAttrNames().contains(name)) {
|
||||
Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER);
|
||||
checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
|
||||
return attrBuffer.getString(0);
|
||||
}
|
||||
else {
|
||||
throw new InvalidAttributeValueException("Attribute " + name + " is not supported by this implementation of DRMAA");
|
||||
}
|
||||
}
|
||||
|
||||
public static void setVectorAttribute(Pointer jt, String name, Collection<String> values) throws DrmaaException {
|
||||
|
|
|
|||
|
|
@ -24,33 +24,6 @@ public class BaseUtils {
|
|||
public final static byte[] BASES = {'A', 'C', 'G', 'T'};
|
||||
public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'};
|
||||
|
||||
public enum Base {
|
||||
A('A', 0),
|
||||
C('C', 1),
|
||||
G('G', 2),
|
||||
T('T', 3);
|
||||
|
||||
byte b;
|
||||
int index;
|
||||
|
||||
private Base(char base, int index) {
|
||||
this.b = (byte) base;
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
public byte getBase() { return b; }
|
||||
|
||||
public char getBaseAsChar() { return (char) b; }
|
||||
|
||||
public int getIndex() { return index; }
|
||||
|
||||
public boolean sameBase(byte o) { return b == o; }
|
||||
|
||||
public boolean sameBase(char o) { return b == (byte) o; }
|
||||
|
||||
public boolean sameBase(int i) { return index == i; }
|
||||
}
|
||||
|
||||
static private final int[] baseIndexMap = new int[256];
|
||||
static {
|
||||
Arrays.fill(baseIndexMap, -1);
|
||||
|
|
@ -130,6 +103,17 @@ public class BaseUtils {
|
|||
return false;
|
||||
}
|
||||
|
||||
public static boolean isUpperCase(final byte[] bases) {
|
||||
for ( byte base : bases )
|
||||
if ( ! isUpperCase(base) )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean isUpperCase(final byte base) {
|
||||
return base >= 'A' && base <= 'Z';
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a IUPAC nucleotide code to a pair of bases
|
||||
*
|
||||
|
|
@ -271,59 +255,6 @@ public class BaseUtils {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a base index to a base index representing its cross-talk partner
|
||||
*
|
||||
* @param baseIndex 0, 1, 2, 3
|
||||
* @return 1, 0, 3, 2, or -1 if the index can't be understood
|
||||
*/
|
||||
static public int crossTalkPartnerIndex(int baseIndex) {
|
||||
switch (baseIndex) {
|
||||
case 0:
|
||||
return 1; // A -> C
|
||||
case 1:
|
||||
return 0; // C -> A
|
||||
case 2:
|
||||
return 3; // G -> T
|
||||
case 3:
|
||||
return 2; // T -> G
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a base to the base representing its cross-talk partner
|
||||
*
|
||||
* @param base [AaCcGgTt]
|
||||
* @return C, A, T, G, or '.' if the base can't be understood
|
||||
*/
|
||||
@Deprecated
|
||||
static public char crossTalkPartnerBase(char base) {
|
||||
return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the complement of a base index.
|
||||
*
|
||||
* @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
|
||||
* @return the complementary base index
|
||||
*/
|
||||
static public byte complementIndex(int baseIndex) {
|
||||
switch (baseIndex) {
|
||||
case 0:
|
||||
return 3; // a -> t
|
||||
case 1:
|
||||
return 2; // c -> g
|
||||
case 2:
|
||||
return 1; // g -> c
|
||||
case 3:
|
||||
return 0; // t -> a
|
||||
default:
|
||||
return -1; // wtf?
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
|
||||
*
|
||||
|
|
@ -350,7 +281,7 @@ public class BaseUtils {
|
|||
}
|
||||
|
||||
@Deprecated
|
||||
static public char simpleComplement(char base) {
|
||||
static private char simpleComplement(char base) {
|
||||
return (char) simpleComplement((byte) base);
|
||||
}
|
||||
|
||||
|
|
@ -370,22 +301,6 @@ public class BaseUtils {
|
|||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
|
||||
*
|
||||
* @param bases the byte array of bases
|
||||
* @return the complement of the base byte array
|
||||
*/
|
||||
static public byte[] simpleComplement(byte[] bases) {
|
||||
byte[] rcbases = new byte[bases.length];
|
||||
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
rcbases[i] = simpleComplement(bases[i]);
|
||||
}
|
||||
|
||||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse complement a char array of bases
|
||||
*
|
||||
|
|
@ -403,23 +318,6 @@ public class BaseUtils {
|
|||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Complement a char array of bases
|
||||
*
|
||||
* @param bases the char array of bases
|
||||
* @return the complement of the base char array
|
||||
*/
|
||||
@Deprecated
|
||||
static public char[] simpleComplement(char[] bases) {
|
||||
char[] rcbases = new char[bases.length];
|
||||
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
rcbases[i] = simpleComplement(bases[i]);
|
||||
}
|
||||
|
||||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse complement a String of bases. Preserves ambiguous bases.
|
||||
*
|
||||
|
|
@ -431,17 +329,6 @@ public class BaseUtils {
|
|||
return new String(simpleReverseComplement(bases.getBytes()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Complement a String of bases. Preserves ambiguous bases.
|
||||
*
|
||||
* @param bases the String of bases
|
||||
* @return the complement of the String
|
||||
*/
|
||||
@Deprecated
|
||||
static public String simpleComplement(String bases) {
|
||||
return new String(simpleComplement(bases.getBytes()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the uppercased version of the bases
|
||||
*
|
||||
|
|
@ -543,82 +430,4 @@ public class BaseUtils {
|
|||
|
||||
return randomBaseIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a random base (A, C, G, T).
|
||||
*
|
||||
* @return a random base (A, C, G, T)
|
||||
*/
|
||||
@Deprecated
|
||||
static public byte getRandomBase() {
|
||||
return getRandomBase('.');
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a random base, excluding some base.
|
||||
*
|
||||
* @param excludeBase the base to exclude
|
||||
* @return a random base, excluding the one specified (A, C, G, T)
|
||||
*/
|
||||
@Deprecated
|
||||
static public byte getRandomBase(char excludeBase) {
|
||||
return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
|
||||
* that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
|
||||
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
|
||||
* of 2 (it has a period of 4 as well), and so does
|
||||
* "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
|
||||
* the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range
|
||||
* or if specified minPeriod is greater than the sequence length.
|
||||
*
|
||||
* @param seq
|
||||
* @return
|
||||
*/
|
||||
public static int sequencePeriod(byte[] seq, int minPeriod) {
|
||||
int period = (minPeriod > seq.length ? seq.length : minPeriod);
|
||||
// we assume that bases [0,period-1] repeat themselves and check this assumption
|
||||
// until we find correct period
|
||||
|
||||
for (int pos = period; pos < seq.length; pos++) {
|
||||
|
||||
int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
|
||||
// if our current hypothesis holds, base[pos] must be the same as base[offset]
|
||||
|
||||
if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) {
|
||||
|
||||
// period we have been trying so far does not work.
|
||||
// two possibilities:
|
||||
// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
|
||||
// in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
|
||||
// so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
|
||||
// pos will be autoincremented and we will be checking next base
|
||||
// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
|
||||
// hence we should first check if it matches the first base of the sequence, and to do that
|
||||
// we set period to pos (thus trying the hypothesis that bases from start up to the current one,
|
||||
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
|
||||
// on the next loop re-entrance after pos is autoincremented)
|
||||
if (offset == 0)
|
||||
period = pos + 1;
|
||||
else
|
||||
period = pos--;
|
||||
|
||||
}
|
||||
}
|
||||
return period;
|
||||
}
|
||||
}
|
||||
|
||||
/* code snippet for testing sequencePeriod():
|
||||
*
|
||||
* String str = "CCTTG";
|
||||
int p = 0;
|
||||
System.out.print("Periods of " + str +" are:");
|
||||
while ( p < str.length() ) {
|
||||
p = sequencePeriod(str, p+1);
|
||||
System.out.print(" "+p);
|
||||
}
|
||||
System.out.println(); System.exit(1);
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() ));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests whether this genome loc starts at the same position as that.
|
||||
*
|
||||
* i.e., do this and that have the same contig and the same start position
|
||||
*
|
||||
* @param that genome loc to compare to
|
||||
* @return true if this and that have the same contig and the same start position
|
||||
*/
|
||||
@Requires("that != null")
|
||||
public final boolean startsAt( GenomeLoc that ) {
|
||||
int comparison = this.compareContigs(that);
|
||||
return comparison == 0 && this.getStart() == that.getStart();
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests whether any portion of this contig is before that contig.
|
||||
* @param that Other contig to test.
|
||||
|
|
@ -481,4 +495,14 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
public long sizeOfOverlap( final GenomeLoc that ) {
|
||||
return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L );
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the maximum GenomeLoc of this and other
|
||||
* @param other another non-null genome loc
|
||||
* @return the max of this and other
|
||||
*/
|
||||
public GenomeLoc max(final GenomeLoc other) {
|
||||
final int cmp = this.compareTo(other);
|
||||
return cmp == -1 ? other : this;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
// our private storage for the GenomeLoc's
|
||||
private List<GenomeLoc> mArray = new ArrayList<GenomeLoc>();
|
||||
|
||||
// cache this to make overlap checking much more efficient
|
||||
private int previousOverlapSearchIndex = -1;
|
||||
|
||||
/** default constructor */
|
||||
public GenomeLocSortedSet(GenomeLocParser parser) {
|
||||
this.genomeLocParser = parser;
|
||||
|
|
@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
* Return the number of bps before loc in the sorted set
|
||||
*
|
||||
* @param loc the location before which we are counting bases
|
||||
* @return
|
||||
* @return the number of base pairs over all previous intervals
|
||||
*/
|
||||
public long sizeBeforeLoc(GenomeLoc loc) {
|
||||
long s = 0;
|
||||
|
|
@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
if ( e.isBefore(loc) )
|
||||
s += e.size();
|
||||
else if ( e.isPast(loc) )
|
||||
; // don't do anything
|
||||
break; // we are done
|
||||
else // loc is inside of s
|
||||
s += loc.getStart() - e.getStart();
|
||||
}
|
||||
|
|
@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
* Determine if the given loc overlaps any loc in the sorted set
|
||||
*
|
||||
* @param loc the location to test
|
||||
* @return
|
||||
* @return true if the location overlaps any loc
|
||||
*/
|
||||
public boolean overlaps(final GenomeLoc loc) {
|
||||
for(final GenomeLoc e : mArray) {
|
||||
if(e.overlapsP(loc)) {
|
||||
return true;
|
||||
}
|
||||
// edge condition
|
||||
if ( mArray.isEmpty() )
|
||||
return false;
|
||||
|
||||
// use the cached version first
|
||||
if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) )
|
||||
return true;
|
||||
|
||||
// update the cached index
|
||||
previousOverlapSearchIndex = Collections.binarySearch(mArray, loc);
|
||||
|
||||
// if it matches an interval exactly, we are done
|
||||
if ( previousOverlapSearchIndex >= 0 )
|
||||
return true;
|
||||
|
||||
// check whether it overlaps the interval before or after the insertion point
|
||||
previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2);
|
||||
return overlapsAtOrImmediatelyAfterCachedIndex(loc, false);
|
||||
}
|
||||
|
||||
private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) {
|
||||
// check the cached entry
|
||||
if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) )
|
||||
return true;
|
||||
|
||||
// check the entry after the cached entry since we may have moved to it
|
||||
boolean returnValue = false;
|
||||
if ( previousOverlapSearchIndex < mArray.size() - 1 ) {
|
||||
returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc);
|
||||
if ( updateCachedIndex )
|
||||
previousOverlapSearchIndex++;
|
||||
}
|
||||
return false;
|
||||
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
mArray.add(e);
|
||||
return true;
|
||||
} else {
|
||||
int loc = Collections.binarySearch(mArray,e);
|
||||
final int loc = Collections.binarySearch(mArray,e);
|
||||
if (loc >= 0) {
|
||||
throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString());
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -41,15 +41,15 @@ public class Haplotype {
|
|||
protected final byte[] bases;
|
||||
protected final double[] quals;
|
||||
private GenomeLoc genomeLocation = null;
|
||||
private HashMap<String, double[]> readLikelihoodsPerSample = null;
|
||||
private HashMap<String, int[]> readCountsPerSample = null;
|
||||
private HashMap<Integer, VariantContext> eventMap = null;
|
||||
private boolean isRef = false;
|
||||
private Cigar cigar;
|
||||
private int alignmentStartHapwrtRef;
|
||||
public int leftBreakPoint = 0;
|
||||
public int rightBreakPoint = 0;
|
||||
|
||||
private Allele artificialAllele = null;
|
||||
private int artificialAllelePosition = -1;
|
||||
|
||||
/**
|
||||
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
|
||||
*
|
||||
|
|
@ -71,6 +71,12 @@ public class Haplotype {
|
|||
this(bases, 0);
|
||||
}
|
||||
|
||||
protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) {
|
||||
this(bases, 0);
|
||||
this.artificialAllele = artificialAllele;
|
||||
this.artificialAllelePosition = artificialAllelePosition;
|
||||
}
|
||||
|
||||
public Haplotype( final byte[] bases, final GenomeLoc loc ) {
|
||||
this(bases);
|
||||
this.genomeLocation = loc;
|
||||
|
|
@ -86,31 +92,6 @@ public class Haplotype {
|
|||
return Arrays.hashCode(bases);
|
||||
}
|
||||
|
||||
public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) {
|
||||
if( readLikelihoodsPerSample == null ) {
|
||||
readLikelihoodsPerSample = new HashMap<String, double[]>();
|
||||
}
|
||||
readLikelihoodsPerSample.put(sample, readLikelihoods);
|
||||
if( readCountsPerSample == null ) {
|
||||
readCountsPerSample = new HashMap<String, int[]>();
|
||||
}
|
||||
readCountsPerSample.put(sample, readCounts);
|
||||
}
|
||||
|
||||
@Ensures({"result != null"})
|
||||
public double[] getReadLikelihoods( final String sample ) {
|
||||
return readLikelihoodsPerSample.get(sample);
|
||||
}
|
||||
|
||||
@Ensures({"result != null"})
|
||||
public int[] getReadCounts( final String sample ) {
|
||||
return readCountsPerSample.get(sample);
|
||||
}
|
||||
|
||||
public Set<String> getSampleKeySet() {
|
||||
return readLikelihoodsPerSample.keySet();
|
||||
}
|
||||
|
||||
public HashMap<Integer, VariantContext> getEventMap() {
|
||||
return eventMap;
|
||||
}
|
||||
|
|
@ -171,8 +152,25 @@ public class Haplotype {
|
|||
this.cigar = cigar;
|
||||
}
|
||||
|
||||
public boolean isArtificialHaplotype() {
|
||||
return artificialAllele != null;
|
||||
}
|
||||
|
||||
public Allele getArtificialAllele() {
|
||||
return artificialAllele;
|
||||
}
|
||||
|
||||
public int getArtificialAllelePosition() {
|
||||
return artificialAllelePosition;
|
||||
}
|
||||
|
||||
public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) {
|
||||
this.artificialAllele = artificialAllele;
|
||||
this.artificialAllelePosition = artificialAllelePosition;
|
||||
}
|
||||
|
||||
@Requires({"refInsertLocation >= 0"})
|
||||
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) {
|
||||
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) {
|
||||
// refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates
|
||||
final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true);
|
||||
if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype
|
||||
|
|
@ -182,7 +180,7 @@ public class Haplotype {
|
|||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant
|
||||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant
|
||||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant
|
||||
return new Haplotype(newHaplotypeBases);
|
||||
return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation);
|
||||
}
|
||||
|
||||
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ import java.util.Collection;
|
|||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* THIS IMPLEMENTATION IS BROKEN AND WILL BE REMOVED ONCE THE DOWNSAMPLING ENGINE FORK COLLAPSES
|
||||
*
|
||||
* Randomly downsample from a stream of elements. This algorithm is a direct,
|
||||
* naive implementation of reservoir downsampling as described in "Random Downsampling
|
||||
* with a Reservoir" (Vitter 1985). At time of writing, this paper is located here:
|
||||
|
|
@ -16,7 +18,7 @@ import java.util.Iterator;
|
|||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class ReservoirDownsampler<T> {
|
||||
public class LegacyReservoirDownsampler<T> {
|
||||
/**
|
||||
* The reservoir of elements tracked by this downsampler.
|
||||
*/
|
||||
|
|
@ -31,7 +33,7 @@ public class ReservoirDownsampler<T> {
|
|||
* Create a new downsampler whose reservoir is sized for the given maximum number of elements.
|
||||
* @param maxElements What is the maximum number of reads that can be returned in any call of this
|
||||
*/
|
||||
public ReservoirDownsampler(final int maxElements) {
|
||||
public LegacyReservoirDownsampler(final int maxElements) {
|
||||
if(maxElements < 0)
|
||||
throw new ReviewedStingException("Unable to work with an negative size collection of elements");
|
||||
this.reservoir = new ArrayList<T>(maxElements);
|
||||
|
|
@ -14,7 +14,7 @@ public class QualityUtils {
|
|||
public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
|
||||
|
||||
public final static double MIN_REASONABLE_ERROR = 0.0001;
|
||||
public final static byte MAX_REASONABLE_Q_SCORE = 40;
|
||||
public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious
|
||||
public final static byte MIN_USABLE_Q_SCORE = 6;
|
||||
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
|
||||
|
||||
|
|
|
|||
|
|
@ -293,6 +293,10 @@ public class Utils {
|
|||
}
|
||||
}
|
||||
|
||||
public static <T> String join(final String separator, final T ... objects) {
|
||||
return join(separator, Arrays.asList(objects));
|
||||
}
|
||||
|
||||
public static String dupString(char c, int nCopies) {
|
||||
char[] chars = new char[nCopies];
|
||||
Arrays.fill(chars, c);
|
||||
|
|
@ -687,23 +691,71 @@ public class Utils {
|
|||
array[i] = value;
|
||||
}
|
||||
|
||||
public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
|
||||
final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME);
|
||||
|
||||
SAMFileHeader header = toolkit.getSAMFileHeader();
|
||||
/**
|
||||
* Builds the header for the output BAM: clones the original header and, when a program record is given, replaces its
|
||||
* program records (@PG tags) with the retained old records plus the new one. The cloned header is returned; no writer is modified.
|
||||
*
|
||||
* @param toolkit the engine
|
||||
* @param originalHeader original header
|
||||
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
|
||||
* @param programRecord the program record for this program
|
||||
*/
|
||||
public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) {
|
||||
SAMFileHeader header = originalHeader.clone();
|
||||
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
|
||||
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
|
||||
for ( SAMProgramRecord record : oldRecords )
|
||||
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS )
|
||||
if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS )
|
||||
newRecords.add(record);
|
||||
|
||||
newRecords.add(programRecord);
|
||||
header.setProgramRecords(newRecords);
|
||||
if (programRecord != null) {
|
||||
newRecords.add(programRecord);
|
||||
header.setProgramRecords(newRecords);
|
||||
}
|
||||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and returns
|
||||
* the new header to be added to the BAM writer.
|
||||
*
|
||||
* @param toolkit the engine
|
||||
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
|
||||
* @param walker the walker object (so we can extract the command line)
|
||||
* @param PROGRAM_RECORD_NAME the name for the PG tag
|
||||
* @return a pre-filled header for the bam writer
|
||||
*/
|
||||
public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
|
||||
final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME);
|
||||
return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets
|
||||
* up the writer with the header and presorted status.
|
||||
*
|
||||
* @param writer BAM file writer
|
||||
* @param toolkit the engine
|
||||
* @param preSorted whether or not the writer can assume reads are going to be added are already sorted
|
||||
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
|
||||
* @param walker the walker object (so we can extract the command line)
|
||||
* @param PROGRAM_RECORD_NAME the name for the PG tag
|
||||
*/
|
||||
public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
|
||||
SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME);
|
||||
writer.writeHeader(header);
|
||||
writer.setPresorted(preSorted);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Creates a program record (@PG) tag
|
||||
*
|
||||
* @param toolkit the engine
|
||||
* @param walker the walker object (so we can extract the command line)
|
||||
* @param PROGRAM_RECORD_NAME the name for the PG tag
|
||||
* @return a program record for the tool
|
||||
*/
|
||||
public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) {
|
||||
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
|
||||
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
|
||||
|
|
@ -858,4 +910,5 @@ public class Utils {
|
|||
}
|
||||
return subLists;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.HasGenomeLocation;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -54,27 +54,31 @@ public class ActiveRegion implements HasGenomeLocation {
|
|||
|
||||
public ArrayList<GATKSAMRecord> getReads() { return reads; }
|
||||
|
||||
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) {
|
||||
return getActiveRegionReference(referenceReader, 0);
|
||||
}
|
||||
|
||||
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
return getReference( referenceReader, padding, extendedLoc );
|
||||
}
|
||||
|
||||
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) {
|
||||
return getFullReference(referenceReader, 0);
|
||||
}
|
||||
|
||||
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
return getReference( referenceReader, padding, fullExtentReferenceLoc );
|
||||
}
|
||||
|
||||
private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
|
||||
final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(),
|
||||
Math.max(1, genomeLoc.getStart() - padding),
|
||||
Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();
|
||||
StringUtil.toUpperCase(reference);
|
||||
return reference;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,16 @@
|
|||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: thibault
|
||||
* Date: 11/26/12
|
||||
* Time: 2:35 PM
|
||||
*
|
||||
* Describes how a read relates to an assigned ActiveRegion
|
||||
*/
|
||||
public enum ActiveRegionReadState {
    /** This is the read's primary region */
    PRIMARY,

    /** This region overlaps the read, but it is not primary */
    NONPRIMARY,

    /** This region would overlap the read if it were extended */
    EXTENDED,

    /** This read is not mapped */
    UNMAPPED
}
|
||||
|
|
@ -24,11 +24,11 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
|
@ -45,6 +45,7 @@ public class ActivityProfile {
|
|||
final GenomeLocParser parser;
|
||||
final boolean presetRegions;
|
||||
GenomeLoc regionStartLoc = null;
|
||||
GenomeLoc regionStopLoc = null;
|
||||
final List<ActivityProfileResult> isActiveList;
|
||||
private static final int FILTER_SIZE = 80;
|
||||
private static final double[] GaussianKernel;
|
||||
|
|
@ -71,19 +72,49 @@ public class ActivityProfile {
|
|||
this.regionStartLoc = regionStartLoc;
|
||||
}
|
||||
|
||||
public void add(final GenomeLoc loc, final ActivityProfileResult result) {
|
||||
if ( loc.size() != 1 )
|
||||
throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" );
|
||||
isActiveList.add(result);
|
||||
if( regionStartLoc == null ) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ActivityProfile{" +
|
||||
"start=" + regionStartLoc +
|
||||
", stop=" + regionStopLoc +
|
||||
'}';
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the next ActivityProfileResult to this profile.
|
||||
*
|
||||
* Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown
|
||||
*
|
||||
* @param result a well-formed ActivityProfileResult result to incorporate into this profile
|
||||
*/
|
||||
@Requires("result != null")
|
||||
public void add(final ActivityProfileResult result) {
|
||||
final GenomeLoc loc = result.getLoc();
|
||||
|
||||
if ( regionStartLoc == null ) {
|
||||
regionStartLoc = loc;
|
||||
regionStopLoc = loc;
|
||||
} else {
|
||||
if ( regionStopLoc.getStart() != loc.getStart() - 1 )
|
||||
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc );
|
||||
regionStopLoc = loc;
|
||||
}
|
||||
|
||||
isActiveList.add(result);
|
||||
}
|
||||
|
||||
/**
 * @return the number of ActivityProfileResults currently held in isActiveList
 */
public int size() {
|
||||
return isActiveList.size();
|
||||
}
|
||||
|
||||
/**
 * @return true if isActiveList holds no ActivityProfileResults
 */
public boolean isEmpty() {
|
||||
return isActiveList.isEmpty();
|
||||
}
|
||||
|
||||
/**
 * @return the value of the presetRegions flag this profile was created with
 */
public boolean hasPresetRegions() {
|
||||
return presetRegions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Band pass this ActivityProfile, producing a new profile that's band pass filtered
|
||||
* @return a new ActivityProfile that's the band-pass filtered version of this profile
|
||||
|
|
@ -104,14 +135,21 @@ public class ActivityProfile {
|
|||
}
|
||||
iii++;
|
||||
}
|
||||
final double[] filteredProbArray = new double[activeProbArray.length];
|
||||
|
||||
final double[] filteredProbArray;
|
||||
if( !presetRegions ) {
|
||||
// if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray
|
||||
filteredProbArray = new double[activeProbArray.length];
|
||||
for( iii = 0; iii < activeProbArray.length; iii++ ) {
|
||||
final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii));
|
||||
final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1));
|
||||
filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
|
||||
}
|
||||
} else {
|
||||
// otherwise we simply use the activeProbArray directly
|
||||
filteredProbArray = activeProbArray;
|
||||
}
|
||||
|
||||
iii = 0;
|
||||
for( final double prob : filteredProbArray ) {
|
||||
final ActivityProfileResult result = isActiveList.get(iii++);
|
||||
|
|
@ -119,6 +157,7 @@ public class ActivityProfile {
|
|||
result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE;
|
||||
result.resultValue = null;
|
||||
}
|
||||
|
||||
return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc);
|
||||
}
|
||||
|
||||
|
|
@ -166,6 +205,7 @@ public class ActivityProfile {
|
|||
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) {
|
||||
return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
|
||||
}
|
||||
|
||||
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List<ActiveRegion> returnList) {
|
||||
if( !isActive || curEnd - curStart < maxRegionSize ) {
|
||||
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
|
||||
|
|
|
|||
|
|
@ -1,12 +1,16 @@
|
|||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 7/27/12
|
||||
*/
|
||||
|
||||
public class ActivityProfileResult {
|
||||
private GenomeLoc loc;
|
||||
public double isActiveProb;
|
||||
public ActivityProfileResultState resultState;
|
||||
public Number resultValue;
|
||||
|
|
@ -16,16 +20,52 @@ public class ActivityProfileResult {
|
|||
HIGH_QUALITY_SOFT_CLIPS
|
||||
}
|
||||
|
||||
public ActivityProfileResult( final double isActiveProb ) {
|
||||
this.isActiveProb = isActiveProb;
|
||||
this.resultState = ActivityProfileResultState.NONE;
|
||||
this.resultValue = null;
|
||||
/**
|
||||
* Create a new ActivityProfileResult at loc with probability of being active of isActiveProb
|
||||
*
|
||||
* @param loc the position of the result profile (for debugging purposes)
|
||||
* @param isActiveProb the probability of being active (between 0 and 1)
|
||||
*/
|
||||
@Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"})
|
||||
public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) {
|
||||
this(loc, isActiveProb, ActivityProfileResultState.NONE, null);
|
||||
}
|
||||
|
||||
public ActivityProfileResult( final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) {
|
||||
/**
|
||||
* Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some
|
||||
* information about the result state and value (TODO RYAN -- what do these mean?)
|
||||
*
|
||||
* @param loc the position of the result profile (for debugging purposes)
|
||||
* @param isActiveProb the probability of being active (between 0 and 1)
|
||||
*/
|
||||
@Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"})
|
||||
public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) {
|
||||
// make sure the location of that activity profile is 1
|
||||
if ( loc.size() != 1 )
|
||||
throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc);
|
||||
|
||||
this.loc = loc;
|
||||
this.isActiveProb = isActiveProb;
|
||||
this.resultState = resultState;
|
||||
this.resultValue = resultValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the genome loc associated with the ActivityProfileResult
|
||||
* @return the location of this result
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public GenomeLoc getLoc() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ActivityProfileResult{" +
|
||||
"loc=" + loc +
|
||||
", isActiveProb=" + isActiveProb +
|
||||
", resultState=" + resultState +
|
||||
", resultValue=" + resultValue +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -414,7 +414,7 @@ public class BAQ {
|
|||
throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read);
|
||||
// the original quality is too high, almost certainly due to using the wrong encoding in the BAM file
|
||||
if ( tag > Byte.MAX_VALUE )
|
||||
throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores");
|
||||
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score (" + (int)read.getBaseQualities()[i] + ") with BAQ correction factor of " + baq_i);
|
||||
bqTag[i] = (byte)tag;
|
||||
}
|
||||
return new String(bqTag);
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class PluginManager<PluginType> {
|
|||
* Create a new plugin manager.
|
||||
* @param pluginType Core type for a plugin.
|
||||
*/
|
||||
public PluginManager(Class<PluginType> pluginType) {
|
||||
public PluginManager(Class pluginType) {
|
||||
this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), null);
|
||||
}
|
||||
|
||||
|
|
@ -110,7 +110,7 @@ public class PluginManager<PluginType> {
|
|||
* @param pluginType Core type for a plugin.
|
||||
* @param classpath Custom class path to search for classes.
|
||||
*/
|
||||
public PluginManager(Class<PluginType> pluginType, List<URL> classpath) {
|
||||
public PluginManager(Class pluginType, List<URL> classpath) {
|
||||
this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), classpath);
|
||||
}
|
||||
|
||||
|
|
@ -120,7 +120,7 @@ public class PluginManager<PluginType> {
|
|||
* @param pluginCategory Provides a category name to the plugin. Must not be null.
|
||||
* @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null.
|
||||
*/
|
||||
public PluginManager(Class<PluginType> pluginType, String pluginCategory, String pluginSuffix) {
|
||||
public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) {
|
||||
this(pluginType, pluginCategory, pluginSuffix, null);
|
||||
}
|
||||
|
||||
|
|
@ -131,7 +131,7 @@ public class PluginManager<PluginType> {
|
|||
* @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null.
|
||||
* @param classpath Custom class path to search for classes.
|
||||
*/
|
||||
public PluginManager(Class<PluginType> pluginType, String pluginCategory, String pluginSuffix, List<URL> classpath) {
|
||||
public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List<URL> classpath) {
|
||||
this.pluginCategory = pluginCategory;
|
||||
this.pluginSuffix = pluginSuffix;
|
||||
|
||||
|
|
@ -149,6 +149,7 @@ public class PluginManager<PluginType> {
|
|||
}
|
||||
|
||||
// Load all classes types filtering them by concrete.
|
||||
@SuppressWarnings("unchecked")
|
||||
Set<Class<? extends PluginType>> allTypes = reflections.getSubTypesOf(pluginType);
|
||||
for( Class<? extends PluginType> type: allTypes ) {
|
||||
// The plugin manager does not support anonymous classes; to be a plugin, a class must have a name.
|
||||
|
|
@ -325,7 +326,7 @@ public class PluginManager<PluginType> {
|
|||
* @param pluginType The type of plugin.
|
||||
* @return A name for this type of plugin.
|
||||
*/
|
||||
public String getName(Class<? extends PluginType> pluginType) {
|
||||
public String getName(Class pluginType) {
|
||||
String pluginName = "";
|
||||
|
||||
if (pluginName.length() == 0) {
|
||||
|
|
|
|||
|
|
@ -587,7 +587,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
|
|||
|
||||
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
if ( nParts != genotypeParts.length )
|
||||
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
|
||||
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo);
|
||||
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
|
||||
|
||||
|
|
|
|||
|
|
@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord;
|
|||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -317,4 +322,33 @@ public class VCFUtils {
|
|||
assembly = "hg19";
|
||||
return assembly;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read all of the VCF records from source into memory, returning the header and the VariantContexts
|
||||
*
|
||||
* @param source the file to read, must be in VCF4 format
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public static Pair<VCFHeader, List<VariantContext>> readVCF(final File source) throws IOException {
|
||||
// read in the features
|
||||
final List<VariantContext> vcs = new ArrayList<VariantContext>();
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||
pbs.close();
|
||||
|
||||
pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
pbs.skip(header.getHeaderEnd());
|
||||
|
||||
final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue();
|
||||
|
||||
while ( ! pbs.isDone() ) {
|
||||
final VariantContext vc = codec.decode(pbs);
|
||||
if ( vc != null )
|
||||
vcs.add(vc);
|
||||
}
|
||||
|
||||
return new Pair<VCFHeader, List<VariantContext>>(vcfHeader, vcs);
|
||||
}
|
||||
}
|
||||
|
|
@ -240,6 +240,16 @@ public class UserException extends ReviewedStingException {
|
|||
}
|
||||
}
|
||||
|
||||
public static class MisencodedBAM extends UserException {
|
||||
public MisencodedBAM(SAMRecord read, String message) {
|
||||
this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message);
|
||||
}
|
||||
|
||||
public MisencodedBAM(String source, String message) {
|
||||
super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message));
|
||||
}
|
||||
}
|
||||
|
||||
public static class MalformedVCF extends UserException {
|
||||
public MalformedVCF(String message, String line) {
|
||||
super(String.format("The provided VCF file is malformed at line %s: %s", line, message));
|
||||
|
|
@ -268,7 +278,7 @@ public class UserException extends ReviewedStingException {
|
|||
|
||||
public static class ReadMissingReadGroup extends MalformedBAM {
|
||||
public ReadMissingReadGroup(SAMRecord read) {
|
||||
super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use " + HelpUtils.GATK_FORUM_URL + "discussion/59/companion-utilities-replacereadgroups to fix this problem", read.getReadName()));
|
||||
super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use " + HelpUtils.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName()));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -344,7 +354,7 @@ public class UserException extends ReviewedStingException {
|
|||
super(String.format("Lexicographically sorted human genome sequence detected in %s."
|
||||
+ "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs."
|
||||
+ "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files."
|
||||
+ "\nYou can use the ReorderSam utility to fix this problem: " + HelpUtils.GATK_FORUM_URL + "discussion/58/companion-utilities-reordersam"
|
||||
+ "\nYou can use the ReorderSam utility to fix this problem: " + HelpUtils.forumPost("discussion/58/companion-utilities-reordersam")
|
||||
+ "\n %s contigs = %s",
|
||||
name, name, ReadUtils.prettyPrintSequenceRecords(dict)));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import net.sf.picard.reference.FastaSequenceIndex;
|
|||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.apache.log4j.Priority;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
|
|
@ -40,6 +41,8 @@ import java.util.Arrays;
|
|||
* A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer.
|
||||
*
|
||||
* Thread-safe! Uses a thread-local cache
|
||||
*
|
||||
* Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set
|
||||
*/
|
||||
public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
||||
protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class);
|
||||
|
|
@ -54,10 +57,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
public static final long DEFAULT_CACHE_SIZE = 1000000;
|
||||
|
||||
/** The cache size of this CachingIndexedFastaSequenceFile */
|
||||
final long cacheSize;
|
||||
private final long cacheSize;
|
||||
|
||||
/** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */
|
||||
final long cacheMissBackup;
|
||||
private final long cacheMissBackup;
|
||||
|
||||
/**
|
||||
* If true, we will preserve the case of the original bases in the genome; otherwise all bases are converted to upper case
|
||||
*/
|
||||
private final boolean preserveCase;
|
||||
|
||||
// information about checking efficiency
|
||||
long cacheHits = 0;
|
||||
|
|
@ -84,37 +92,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
/**
|
||||
* Same as general constructor but allows one to override the default cacheSize
|
||||
*
|
||||
* @param fasta
|
||||
* @param index
|
||||
* @param cacheSize
|
||||
* @param fasta the file we will read our FASTA sequence from.
|
||||
* @param index the index of the fasta file, used for efficient random access
|
||||
* @param cacheSize the size in bp of the cache we will use for this reader
|
||||
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) {
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) {
|
||||
super(fasta, index);
|
||||
if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0");
|
||||
this.cacheSize = cacheSize;
|
||||
this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
|
||||
* @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) {
|
||||
this(fasta, index, DEFAULT_CACHE_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* Looks for a index file for fasta on disk
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException {
|
||||
this(fasta, DEFAULT_CACHE_SIZE);
|
||||
this.preserveCase = preserveCase;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -124,12 +112,76 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
* Uses provided cacheSize instead of the default
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
|
||||
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException {
    super(fasta);
    // a cache size of 0 is accepted; only negative sizes are rejected, so the message must say ">= 0"
    if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be >= 0 but saw " + cacheSize);
    this.cacheSize = cacheSize;
    // on a cache miss we back up by 1/1000th of the cache size, but always by at least 1
    this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
    this.preserveCase = preserveCase;
}
|
||||
|
||||
// /**
|
||||
// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
// *
|
||||
// * @param fasta The file to open.
|
||||
// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
|
||||
// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
|
||||
// */
|
||||
// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) {
|
||||
// this(fasta, index, DEFAULT_CACHE_SIZE);
|
||||
// }
|
||||
|
||||
/**
|
||||
* Same as general constructor but allows one to override the default cacheSize
|
||||
*
|
||||
* By default, this CachingIndexedFastaReader converts all incoming bases to upper case
|
||||
*
|
||||
* @param fasta the file we will read our FASTA sequence from.
|
||||
* @param index the index of the fasta file, used for efficient random access
|
||||
* @param cacheSize the size in bp of the cache we will use for this reader
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) {
|
||||
this(fasta, index, cacheSize, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* Looks for an index file for fasta on disk.
|
||||
* This CachingIndexedFastaReader will convert all FASTA bases to upper case under the hood
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException {
|
||||
this(fasta, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* Looks for an index file for fasta on disk
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException {
|
||||
this(fasta, DEFAULT_CACHE_SIZE, preserveCase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* Looks for an index file for fasta on disk
|
||||
* Uses provided cacheSize instead of the default
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
|
||||
this(fasta, cacheSize, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -168,6 +220,25 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
return cacheSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this CachingIndexedFastaReader keeping the original case of bases in the fasta, or is
|
||||
* everything being made upper case?
|
||||
*
|
||||
* @return true if the bases coming from this reader are in the original case in the fasta, false if they are all upper cased
|
||||
*/
|
||||
public boolean isPreservingCase() {
|
||||
return preserveCase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is uppercasing bases?
|
||||
*
|
||||
* @return true if the bases coming from this CachingIndexedFastaSequenceFile are all upper cased, false if they are in the original case in the fasta
|
||||
*/
|
||||
public boolean isUppercasingBases() {
|
||||
return ! isPreservingCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the subsequence of the contig in the range [start,stop]
|
||||
*
|
||||
|
|
@ -177,8 +248,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
* @param contig Contig whose subsequence to retrieve.
|
||||
* @param start inclusive, 1-based start of region.
|
||||
* @param stop inclusive, 1-based stop of region.
|
||||
* @return The partial reference sequence associated with this range.
|
||||
* @return The partial reference sequence associated with this range. If preserveCase is false, then
|
||||
* all of the bases in the ReferenceSequence returned by this method will be upper cased.
|
||||
*/
|
||||
@Override
|
||||
public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) {
|
||||
final ReferenceSequence result;
|
||||
final Cache myCache = cache.get();
|
||||
|
|
@ -186,6 +259,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
if ( (stop - start) >= cacheSize ) {
|
||||
cacheMisses++;
|
||||
result = super.getSubsequenceAt(contig, start, stop);
|
||||
if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases());
|
||||
} else {
|
||||
// todo -- potential optimization is to check if contig.name == contig, as this in generally will be true
|
||||
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
|
||||
|
|
@ -198,7 +272,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
myCache.start = Math.max(start - cacheMissBackup, 0);
|
||||
myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength());
|
||||
myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop);
|
||||
//System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop);
|
||||
|
||||
// convert all of the bases in the sequence to upper case if we aren't preserving cases
|
||||
if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases());
|
||||
} else {
|
||||
cacheHits++;
|
||||
}
|
||||
|
|
@ -215,8 +291,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
}
|
||||
}
|
||||
|
||||
// for debugging -- print out our efficiency if requested
|
||||
if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 )
|
||||
printEfficiency(Priority.INFO);
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -38,10 +38,10 @@ import java.util.*;
|
|||
|
||||
public abstract class PerReadAlleleLikelihoodMap {
|
||||
|
||||
public static final double INDEL_LIKELIHOOD_THRESH = 0.1;
|
||||
public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.1;
|
||||
|
||||
protected List<Allele> alleles;
|
||||
protected Map<GATKSAMRecord,Map<Allele,Double>> likelihoodReadMap;
|
||||
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
|
||||
|
||||
public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log);
|
||||
public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log);
|
||||
|
|
@ -68,7 +68,7 @@ public abstract class PerReadAlleleLikelihoodMap {
|
|||
}
|
||||
|
||||
public void add(PileupElement p, Allele a, Double likelihood) {
|
||||
add(p.getRead(),a,likelihood);
|
||||
add(p.getRead(), a, likelihood);
|
||||
}
|
||||
|
||||
public boolean containsPileupElement(PileupElement p) {
|
||||
|
|
@ -120,7 +120,7 @@ public abstract class PerReadAlleleLikelihoodMap {
|
|||
prevMaxLike = el.getValue();
|
||||
}
|
||||
}
|
||||
return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL );
|
||||
return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL );
|
||||
}
|
||||
|
||||
public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() {
|
||||
|
|
|
|||
|
|
@ -38,8 +38,9 @@ public class HelpUtils {
|
|||
public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/";
|
||||
public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/";
|
||||
|
||||
|
||||
|
||||
public static String forumPost(String post) {
|
||||
return GATK_FORUM_URL + post;
|
||||
}
|
||||
|
||||
protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) {
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.nanoScheduler;
|
|||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MultiThreadedErrorTracker;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
|
@ -19,11 +18,6 @@ class InputProducer<InputType> implements Runnable {
|
|||
*/
|
||||
final Iterator<InputType> inputReader;
|
||||
|
||||
/**
|
||||
* Our timer (may be null) that we use to track our input costs
|
||||
*/
|
||||
final SimpleTimer inputTimer;
|
||||
|
||||
/**
|
||||
* Where we put our input values for consumption
|
||||
*/
|
||||
|
|
@ -51,16 +45,13 @@ class InputProducer<InputType> implements Runnable {
|
|||
|
||||
public InputProducer(final Iterator<InputType> inputReader,
|
||||
final MultiThreadedErrorTracker errorTracker,
|
||||
final SimpleTimer inputTimer,
|
||||
final BlockingQueue<InputValue> outputQueue) {
|
||||
if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null");
|
||||
if ( errorTracker == null ) throw new IllegalArgumentException("errorTracker cannot be null");
|
||||
if ( inputTimer == null ) throw new IllegalArgumentException("inputTimer cannot be null");
|
||||
if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null");
|
||||
|
||||
this.inputReader = inputReader;
|
||||
this.errorTracker = errorTracker;
|
||||
this.inputTimer = inputTimer;
|
||||
this.outputQueue = outputQueue;
|
||||
}
|
||||
|
||||
|
|
@ -94,16 +85,15 @@ class InputProducer<InputType> implements Runnable {
|
|||
* @throws InterruptedException
|
||||
*/
|
||||
private synchronized InputType readNextItem() throws InterruptedException {
|
||||
inputTimer.restart();
|
||||
if ( ! inputReader.hasNext() ) {
|
||||
// we are done, mark ourselves as such and return null
|
||||
readLastValue = true;
|
||||
inputTimer.stop();
|
||||
return null;
|
||||
} else {
|
||||
// get the next value, and return it
|
||||
final InputType input = inputReader.next();
|
||||
inputTimer.stop();
|
||||
if ( input == null )
|
||||
throw new IllegalStateException("inputReader.next() returned a null value, breaking our contract");
|
||||
nRead++;
|
||||
return input;
|
||||
}
|
||||
|
|
@ -121,6 +111,9 @@ class InputProducer<InputType> implements Runnable {
|
|||
final InputType value = readNextItem();
|
||||
|
||||
if ( value == null ) {
|
||||
if ( ! readLastValue )
|
||||
throw new IllegalStateException("value == null but readLastValue is false!");
|
||||
|
||||
// add the EOF object so our consumer knows we are done with all inputs
|
||||
// note that we do not increase inputID here, so that variable indicates the ID
|
||||
// of the last real value read from the queue
|
||||
|
|
@ -133,8 +126,10 @@ class InputProducer<InputType> implements Runnable {
|
|||
}
|
||||
|
||||
latch.countDown();
|
||||
} catch (Exception ex) {
|
||||
} catch (Throwable ex) {
|
||||
errorTracker.notifyOfError(ex);
|
||||
} finally {
|
||||
// logger.info("Exiting input thread readLastValue = " + readLastValue);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,67 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.nanoScheduler;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.AutoFormattingTime;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
|
||||
/**
|
||||
* Holds runtime profile (input, read, map) times as tracked by NanoScheduler
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 9/10/12
|
||||
* Time: 8:31 PM
|
||||
*/
|
||||
public class NSRuntimeProfile {
|
||||
final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside");
|
||||
final SimpleTimer inputTimer = new SimpleTimer("input");
|
||||
final SimpleTimer mapTimer = new SimpleTimer("map");
|
||||
final SimpleTimer reduceTimer = new SimpleTimer("reduce");
|
||||
|
||||
/**
|
||||
* Combine the elapsed time information from other with this profile
|
||||
*
|
||||
* @param other a non-null profile
|
||||
*/
|
||||
public void combine(final NSRuntimeProfile other) {
|
||||
outsideSchedulerTimer.addElapsed(other.outsideSchedulerTimer);
|
||||
inputTimer.addElapsed(other.inputTimer);
|
||||
mapTimer.addElapsed(other.mapTimer);
|
||||
reduceTimer.addElapsed(other.reduceTimer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Print the runtime profiling to logger
|
||||
*
|
||||
* @param logger
|
||||
*/
|
||||
public void log(final Logger logger) {
|
||||
log1(logger, "Input time", inputTimer);
|
||||
log1(logger, "Map time", mapTimer);
|
||||
log1(logger, "Reduce time", reduceTimer);
|
||||
log1(logger, "Outside time", outsideSchedulerTimer);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the total runtime for all functions of this nano scheduler
|
||||
*/
|
||||
//@Ensures("result >= 0.0")
|
||||
public double totalRuntimeInSeconds() {
|
||||
return inputTimer.getElapsedTime()
|
||||
+ mapTimer.getElapsedTime()
|
||||
+ reduceTimer.getElapsedTime()
|
||||
+ outsideSchedulerTimer.getElapsedTime();
|
||||
}
|
||||
|
||||
/**
|
||||
* Print to logger.info timing information from timer, with name label
|
||||
*
|
||||
* @param label the name of the timer to display. Should be human readable
|
||||
* @param timer the timer whose elapsed time we will display
|
||||
*/
|
||||
//@Requires({"label != null", "timer != null"})
|
||||
private void log1(final Logger logger, final String label, final SimpleTimer timer) {
|
||||
final double myTimeInSec = timer.getElapsedTime();
|
||||
final double myTimePercent = myTimeInSec / totalRuntimeInSeconds() * 100;
|
||||
logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent));
|
||||
}
|
||||
}
|
||||
|
|
@ -57,16 +57,6 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
boolean debug = false;
|
||||
private NSProgressFunction<InputType> progressFunction = null;
|
||||
|
||||
/**
|
||||
* Tracks the combined runtime profiles across all created nano schedulers
|
||||
*/
|
||||
final static private NSRuntimeProfile combinedNSRuntimeProfiler = new NSRuntimeProfile();
|
||||
|
||||
/**
|
||||
* The profile specific to this nano scheduler
|
||||
*/
|
||||
final private NSRuntimeProfile myNSRuntimeProfile = new NSRuntimeProfile();
|
||||
|
||||
/**
|
||||
* Create a new nanoscheduler with the desired characteristics requested by the argument
|
||||
*
|
||||
|
|
@ -94,9 +84,6 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d"));
|
||||
this.masterExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-master-thread-%d"));
|
||||
}
|
||||
|
||||
// start timing the time spent outside of the nanoScheduler
|
||||
myNSRuntimeProfile.outsideSchedulerTimer.start();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -123,11 +110,6 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
* After this call, execute cannot be invoked without throwing an error
|
||||
*/
|
||||
public void shutdown() {
|
||||
myNSRuntimeProfile.outsideSchedulerTimer.stop();
|
||||
|
||||
// add my timing information to the combined NS runtime profile
|
||||
combinedNSRuntimeProfiler.combine(myNSRuntimeProfile);
|
||||
|
||||
if ( nThreads > 1 ) {
|
||||
shutdownExecutor("inputExecutor", inputExecutor);
|
||||
shutdownExecutor("mapExecutor", mapExecutor);
|
||||
|
|
@ -137,19 +119,6 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
shutdown = true;
|
||||
}
|
||||
|
||||
public void printRuntimeProfile() {
|
||||
myNSRuntimeProfile.log(logger);
|
||||
}
|
||||
|
||||
public static void printCombinedRuntimeProfile() {
|
||||
if ( combinedNSRuntimeProfiler.totalRuntimeInSeconds() > 0.1 )
|
||||
combinedNSRuntimeProfiler.log(logger);
|
||||
}
|
||||
|
||||
protected double getTotalRuntime() {
|
||||
return myNSRuntimeProfile.totalRuntimeInSeconds();
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to cleanly shutdown an execution service, checking that the execution
|
||||
* state is clean when it's done.
|
||||
|
|
@ -245,8 +214,6 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
if ( map == null ) throw new IllegalArgumentException("map function cannot be null");
|
||||
if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null");
|
||||
|
||||
myNSRuntimeProfile.outsideSchedulerTimer.stop();
|
||||
|
||||
ReduceType result;
|
||||
if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) {
|
||||
result = executeSingleThreaded(inputReader, map, initialValue, reduce);
|
||||
|
|
@ -254,7 +221,6 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
result = executeMultiThreaded(inputReader, map, initialValue, reduce);
|
||||
}
|
||||
|
||||
myNSRuntimeProfile.outsideSchedulerTimer.restart();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -273,28 +239,19 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
|
||||
while ( true ) {
|
||||
// start timer to ensure that both hasNext and next are caught by the timer
|
||||
myNSRuntimeProfile.inputTimer.restart();
|
||||
if ( ! inputReader.hasNext() ) {
|
||||
myNSRuntimeProfile.inputTimer.stop();
|
||||
break;
|
||||
} else {
|
||||
final InputType input = inputReader.next();
|
||||
myNSRuntimeProfile.inputTimer.stop();
|
||||
|
||||
// map
|
||||
myNSRuntimeProfile.mapTimer.restart();
|
||||
final long preMapTime = LOG_MAP_TIMES ? 0 : myNSRuntimeProfile.mapTimer.currentTimeNano();
|
||||
final MapType mapValue = map.apply(input);
|
||||
if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime));
|
||||
myNSRuntimeProfile.mapTimer.stop();
|
||||
|
||||
if ( i++ % this.bufferSize == 0 && progressFunction != null )
|
||||
if ( progressFunction != null )
|
||||
progressFunction.progress(input);
|
||||
|
||||
// reduce
|
||||
myNSRuntimeProfile.reduceTimer.restart();
|
||||
sum = reduce.apply(mapValue, sum);
|
||||
myNSRuntimeProfile.reduceTimer.stop();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -320,6 +277,7 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
while ( true ) {
|
||||
// check that no errors occurred while we were waiting
|
||||
handleErrors();
|
||||
// checkForDeadlocks();
|
||||
|
||||
try {
|
||||
final ReduceType result = reduceResult.get(100, TimeUnit.MILLISECONDS);
|
||||
|
|
@ -341,6 +299,26 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
}
|
||||
}
|
||||
|
||||
// private void checkForDeadlocks() {
|
||||
// if ( deadLockCheckCounter++ % 100 == 0 ) {
|
||||
// logger.info("Checking for deadlocks...");
|
||||
// final ThreadMXBean bean = ManagementFactory.getThreadMXBean();
|
||||
// final long[] threadIds = bean.findDeadlockedThreads(); // Returns null if no threads are deadlocked.
|
||||
//
|
||||
// if (threadIds != null) {
|
||||
// final ThreadInfo[] infos = bean.getThreadInfo(threadIds);
|
||||
//
|
||||
// logger.error("!!! Deadlock detected !!!!");
|
||||
// for (final ThreadInfo info : infos) {
|
||||
// logger.error("Thread " + info);
|
||||
// for ( final StackTraceElement elt : info.getStackTrace() ) {
|
||||
// logger.error("\t" + elt.toString());
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
private void handleErrors() {
|
||||
if ( errorTracker.hasAnErrorOccurred() ) {
|
||||
masterExecutor.shutdownNow();
|
||||
|
|
@ -380,7 +358,7 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
|
||||
// Create the input producer and start it running
|
||||
final InputProducer<InputType> inputProducer =
|
||||
new InputProducer<InputType>(inputReader, errorTracker, myNSRuntimeProfile.inputTimer, inputQueue);
|
||||
new InputProducer<InputType>(inputReader, errorTracker, inputQueue);
|
||||
inputExecutor.submit(inputProducer);
|
||||
|
||||
// a priority queue that stores up to bufferSize elements
|
||||
|
|
@ -389,7 +367,7 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
new PriorityBlockingQueue<MapResult<MapType>>();
|
||||
|
||||
final Reducer<MapType, ReduceType> reducer
|
||||
= new Reducer<MapType, ReduceType>(reduce, errorTracker, myNSRuntimeProfile.reduceTimer, initialValue);
|
||||
= new Reducer<MapType, ReduceType>(reduce, errorTracker, initialValue);
|
||||
|
||||
try {
|
||||
int nSubmittedJobs = 0;
|
||||
|
|
@ -408,7 +386,8 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
|
||||
// wait for all of the input and map threads to finish
|
||||
return waitForCompletion(inputProducer, reducer);
|
||||
} catch (Exception ex) {
|
||||
} catch (Throwable ex) {
|
||||
// logger.warn("Reduce job got exception " + ex);
|
||||
errorTracker.notifyOfError(ex);
|
||||
return initialValue;
|
||||
}
|
||||
|
|
@ -486,16 +465,12 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
final InputType input = inputWrapper.getValue();
|
||||
|
||||
// map
|
||||
myNSRuntimeProfile.mapTimer.restart();
|
||||
final long preMapTime = LOG_MAP_TIMES ? 0 : myNSRuntimeProfile.mapTimer.currentTimeNano();
|
||||
final MapType mapValue = map.apply(input);
|
||||
if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime));
|
||||
myNSRuntimeProfile.mapTimer.stop();
|
||||
|
||||
// enqueue the result into the mapResultQueue
|
||||
result = new MapResult<MapType>(mapValue, jobID);
|
||||
|
||||
if ( jobID % bufferSize == 0 && progressFunction != null )
|
||||
if ( progressFunction != null )
|
||||
progressFunction.progress(input);
|
||||
} else {
|
||||
// push back the EOF marker so other waiting threads can read it
|
||||
|
|
@ -508,7 +483,8 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
|
|||
mapResultQueue.put(result);
|
||||
|
||||
final int nReduced = reducer.reduceAsMuchAsPossible(mapResultQueue);
|
||||
} catch (Exception ex) {
|
||||
} catch (Throwable ex) {
|
||||
// logger.warn("Map job got exception " + ex);
|
||||
errorTracker.notifyOfError(ex);
|
||||
} finally {
|
||||
// we finished a map job, release the job queue semaphore
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue