Merge remote-tracking branch 'unstable/master'

commit 04cafffaa7

build.xml | 48

@@ -1,5 +1,5 @@
<!--
~ Copyright (c) 2011, The Broad Institute
~ Copyright (c) 2012, The Broad Institute
~
~ Permission is hereby granted, free of charge, to any person
~ obtaining a copy of this software and associated documentation
@@ -47,6 +47,7 @@
<property name="R.package.path" value="org/broadinstitute/sting/utils/R" />
|
||||
<property name="resource.file" value="StingText.properties" />
|
||||
<property name="resource.path" value="${java.classes}/StingText.properties" />
|
||||
<property name="key.dir" value="${public.dir}/keys" />
|
||||
|
||||
<property name="scala.public.source.dir" value="${public.dir}/scala/src" />
|
||||
<property name="scala.private.source.dir" value="${private.dir}/scala/src" />
|
||||
@@ -215,7 +216,7 @@
<target name="git.describe">
|
||||
<exec executable="git" outputproperty="git.describe.output" resultproperty="git.describe.exit.value" failonerror="false">
|
||||
<arg line="describe" />
|
||||
<arg line="describe --long" />
|
||||
</exec>
|
||||
<condition property="git.describe.succeeded">
|
||||
<equals arg1="${git.describe.exit.value}" arg2="0" />
|
||||
@@ -281,6 +282,10 @@
<equals arg1="${gatk.target}" arg2="private" casesensitive="false" />
|
||||
</condition>
|
||||
|
||||
<condition property="include.external">
|
||||
<available file="${external.dir}"/>
|
||||
</condition>
|
||||
|
||||
<condition property="include.contracts">
|
||||
<equals arg1="${use.contracts}" arg2="true" />
|
||||
</condition>
|
||||
@@ -331,7 +336,7 @@
</javac>
</target>

<target name="gatk.compile.external.source" depends="gatk.compile.public.source,gatk.compile.private.source">
<target name="gatk.compile.external.source" depends="gatk.compile.public.source,gatk.compile.private.source" if="include.external">
<subant target="compile" genericantfile="build.xml">
<property name="build.dir" value="${external.build.dir}" />
<property name="dist.dir" value="${external.dist.dir}" />
@@ -563,6 +568,7 @@
</fileset>
<fileset dir="${java.classes}" includes="**/commandline/**/*.class"/>
<fileset dir="${java.classes}" includes="**/sting/pipeline/**/*.class"/>
<fileset dir="${java.classes}" includes="**/sting/tools/**/*.class"/>
<fileset dir="${java.classes}" includes="**/sting/jna/**/*.class"/>
<fileset dir="${java.classes}" includes="net/sf/picard/**/*.class"/>
<fileset dir="${java.classes}" includes="net/sf/samtools/**/*.class"/>
@@ -611,6 +617,9 @@
<include name="**/gatk/**/*.R"/>
|
||||
<include name="**/alignment/**/*.R"/>
|
||||
</fileset>
|
||||
<fileset dir="${key.dir}">
|
||||
<include name="**/*.key"/>
|
||||
</fileset>
|
||||
<manifest>
|
||||
<attribute name="Main-Class" value="org.broadinstitute.sting.gatk.CommandLineGATK" />
|
||||
</manifest>
|
||||
@@ -761,6 +770,7 @@
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
|
||||
<property name="java.public.test.classes" value="${java.test.classes}/public"/>
|
||||
<property name="java.private.test.classes" value="${java.test.classes}/private"/>
|
||||
<property name="java.external.test.classes" value="${java.test.classes}/external"/>
|
||||
<property name="java.public.test.sources" value="${public.dir}/java/test"/>
|
||||
<property name="java.private.test.sources" value="${private.dir}/java/test"/>
|
||||
<property name="scala.test.classes" value="${build.dir}/scala/testclasses"/>
|
||||
@@ -811,7 +821,23 @@
</javac>
</target>

<target name="test.java.compile" depends="test.java.public.compile, test.java.private.compile"/>
<target name="test.java.external.compile" depends="dist,test.init.compile,test.java.public.compile" if="include.external">
<mkdir dir="${java.external.test.classes}"/>
<echo message="Sting: Compiling external test cases!"/>
<javac fork="true" memoryMaximumSize="512m" destdir="${java.external.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}" srcdir="${external.dir}">
<include name="*/test/**/*.java"/>
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.public.test.classes}"/>
<pathelement location="${java.classes}"/>
<pathelement location="${java.contracts}"/>
<pathelement location="${testng.jar}"/>
</classpath>
<compilerarg value="-proc:none"/>
</javac>
</target>

<target name="test.java.compile" depends="test.java.public.compile, test.java.private.compile, test.java.external.compile"/>

<target name="test.scala.public.compile" depends="test.java.compile,scala.compile" if="scala.include">
<mkdir dir="${scala.public.test.classes}"/>
@@ -852,11 +878,13 @@
<pathelement location="${java.contracts}" />
|
||||
<pathelement location="${java.public.test.classes}" />
|
||||
<pathelement location="${java.private.test.classes}" />
|
||||
<pathelement location="${java.external.test.classes}" />
|
||||
<pathelement location="${scala.public.test.classes}" />
|
||||
<pathelement location="${scala.private.test.classes}" />
|
||||
<pathelement location="${R.tar.dir}" />
|
||||
<pathelement location="${R.public.scripts.dir}" />
|
||||
<pathelement location="${R.private.scripts.dir}" />
|
||||
<pathelement location="${key.dir}" />
|
||||
<path refid="external.dependencies" />
|
||||
</path>
|
||||
|
||||
@@ -934,6 +962,9 @@
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}.class" if="include.private"/>
|
||||
</classfileset>
|
||||
<classfileset dir="${java.external.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}.class" if="include.external"/>
|
||||
</classfileset>
|
||||
<classfileset dir="${scala.public.test.classes}" erroronmissingdir="false">
|
||||
<include name="**/@{testtype}*.class" if="scala.include"/>
|
||||
</classfileset>
|
||||
@@ -1210,7 +1241,14 @@
<!-- Build gsalib R module -->
<target name="gsalib">
<exec executable="R" failonerror="true">
<arg line="R CMD INSTALL -l ${R.library.dir} ${R.public.src.dir}/${R.package.path}/gsalib" />
<arg line="R CMD INSTALL --preclean ${R.public.src.dir}/${R.package.path}/gsalib" />
</exec>
</target>

<target name="clean.gsalib">
<!-- Currently not cleaning out the lib during 'ant clean' -->
<exec executable="R" failonerror="false">
<arg line="R CMD REMOVE gsalib" />
</exec>
</target>
</project>

ivy.xml | 32

@@ -1,3 +1,26 @@
<!--
~ Copyright (c) 2012, The Broad Institute
~
~ Permission is hereby granted, free of charge, to any person
~ obtaining a copy of this software and associated documentation
~ files (the "Software"), to deal in the Software without
~ restriction, including without limitation the rights to use,
~ copy, modify, merge, publish, distribute, sublicense, and/or sell
~ copies of the Software, and to permit persons to whom the
~ Software is furnished to do so, subject to the following
~ conditions:
~
~ The above copyright notice and this permission notice shall be
~ included in all copies or substantial portions of the Software.
~ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
~ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
~ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
~ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
~ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
~ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
~ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
~ OTHER DEALINGS IN THE SOFTWARE.
-->

<ivy-module version="1.0">
<info organisation="org.broadinstitute" module="Sting"/>
@@ -18,10 +41,9 @@
<dependency org="log4j" name="log4j" rev="1.2.15"/>
|
||||
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
|
||||
<dependency org="colt" name="colt" rev="1.2.0"/>
|
||||
<dependency org="jboss" name="javassist" rev="3.7.ga"/>
|
||||
<!-- <dependency org="jboss" name="javassist" rev="3.7.ga"/> -->
|
||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>
|
||||
<dependency org="org.apache.bcel" name="bcel" rev="5.2"/>
|
||||
<dependency org="org.yaml" name="snakeyaml" rev="1.7"/>
|
||||
|
||||
<!-- Dependencies for reflections mvn repository -->
|
||||
<dependency org="org.reflections" name="reflections" rev="0.9.5-RC2"/>
|
||||
@@ -40,17 +62,17 @@
<dependency org="org.apache.commons" name="commons-jexl" rev="2.0"/>
|
||||
<dependency org="commons-lang" name="commons-lang" rev="2.5"/>
|
||||
<dependency org="commons-logging" name="commons-logging" rev="1.1.1"/>
|
||||
<dependency org="commons-io" name="commons-io" rev="2.0"/>
|
||||
<dependency org="commons-io" name="commons-io" rev="2.1"/>
|
||||
<dependency org="org.apache.commons" name="commons-math" rev="2.2" />
|
||||
|
||||
<!-- Lucene core utilities -->
|
||||
<dependency org="org.apache.lucene" name="lucene-core" rev="3.0.3"/>
|
||||
<!-- <dependency org="org.apache.lucene" name="lucene-core" rev="3.0.3"/> -->
|
||||
|
||||
<!-- Dependencies for LSF, DRMAA, and other C libraries -->
|
||||
<dependency org="net.java.dev.jna" name="jna" rev="3.2.7"/>
|
||||
|
||||
<!-- Dependencies for amazon.com S3 support -->
|
||||
<dependency org="net.java.dev.jets3t" name="jets3t" rev="0.8.0"/>
|
||||
<dependency org="net.java.dev.jets3t" name="jets3t" rev="0.8.1"/>
|
||||
|
||||
<!-- Dependencies for GridEngine -->
|
||||
<dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/>
|
||||

@@ -1,6 +1,7 @@
library(gsalib)
require("ggplot2")
require("gplots")
library(ggplot2)
library(gplots)
library(tools)

#
# Standard command line switch. Can be loaded interactively for development
@@ -201,4 +202,7 @@ for ( group in gatkReportData ) {
if ( ! is.na(outputPDF) ) {
dev.off()
}
if (exists("compactPDF")) {
compactPDF(outputPDF)
}
}

@@ -4,11 +4,9 @@
colnames(d) = tableHeader;

for (i in 1:ncol(d)) {
v = suppressWarnings(as.numeric(d[,i]));

if (length(na.omit(as.numeric(v))) == length(d[,i])) {
d[,i] = v;
}
# use the general type.convert infrastructure of read.table to convert column data to R types
v = type.convert(d[,i])
d[,i] = v;
}

usedNames = ls(envir=tableEnv, pattern=tableName);

@@ -1,5 +1,5 @@
#!/bin/sh
export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa-trunk/bwa"
export BWA_HOME="/humgen/gsa-scr1/hanna/src/bio-bwa/bwa"
export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux"
export TARGET_LIB="libbwa.so"
export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread"

@@ -233,6 +233,8 @@ void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_s
void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; }
void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; }
void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; }
void BWA::set_mode_nonstop() { options.mode |= BWA_MODE_NONSTOP; options.max_top2 = 0x7fffffff; }
void BWA::set_max_entries_in_queue(int max_entries) { options.max_entries = max_entries; }

/**
* Create a sequence with a set of reasonable initial defaults.

@@ -60,6 +60,8 @@ class BWA {
void set_mismatch_penalty(int penalty);
void set_gap_open_penalty(int penalty);
void set_gap_extension_penalty(int penalty);
void set_mode_nonstop();
void set_max_entries_in_queue(int max_entries);

// Perform the alignment
Alignment* generate_single_alignment(const char* bases,

@@ -8,11 +8,13 @@
#include "bwa_gateway.h"
|
||||
#include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h"
|
||||
|
||||
typedef void (BWA::*boolean_setter)();
|
||||
typedef void (BWA::*int_setter)(int value);
|
||||
typedef void (BWA::*float_setter)(float value);
|
||||
|
||||
static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment);
|
||||
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name);
|
||||
static void set_boolean_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, boolean_setter setter);
|
||||
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter);
|
||||
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter);
|
||||
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message);
|
||||
@@ -100,6 +102,10 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty);
if(env->ExceptionCheck()) return;
set_boolean_configuration_param(env, configuration, "nonStopMode", bwa, &BWA::set_mode_nonstop);
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "maxEntriesInQueue", bwa, &BWA::set_max_entries_in_queue);
if(env->ExceptionCheck()) return;
}

JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases)
@@ -357,6 +363,36 @@ static jstring get_configuration_file(JNIEnv* env, jobject configuration, const
return path;
}

static void set_boolean_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, boolean_setter setter) {
jclass configuration_class = env->GetObjectClass(configuration);
if(configuration_class == NULL) return;

jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Boolean;");
if(configuration_field == NULL) return;

jobject boxed_value = env->GetObjectField(configuration,configuration_field);
if(env->ExceptionCheck()) return;

if(boxed_value != NULL) {
jclass boolean_box_class = env->FindClass("java/lang/Boolean");
if(boolean_box_class == NULL) return;

jmethodID boolean_extractor = env->GetMethodID(boolean_box_class,"booleanValue", "()Z");
if(boolean_extractor == NULL) return;

jboolean value = env->CallBooleanMethod(boxed_value,boolean_extractor);
if(env->ExceptionCheck()) return;

if(value)
(bwa->*setter)();

env->DeleteLocalRef(boolean_box_class);
}

env->DeleteLocalRef(boxed_value);
env->DeleteLocalRef(configuration_class);
}

static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) {
jclass configuration_class = env->GetObjectClass(configuration);
if(configuration_class == NULL) return;

@@ -1,247 +0,0 @@
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.picard.sam;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
|
||||
import java.util.*;
|
||||
import java.lang.reflect.Constructor;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
|
||||
/**
|
||||
* Provides an iterator interface for merging multiple underlying iterators into a single
|
||||
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
||||
* the requested output format is unsorted, in which case any combination is valid.
|
||||
*/
|
||||
public class MergingSamRecordIterator implements CloseableIterator<SAMRecord> {
|
||||
private final PriorityQueue<ComparableSamRecordIterator> pq;
|
||||
private final SamFileHeaderMerger samHeaderMerger;
|
||||
private final Collection<SAMFileReader> readers;
|
||||
private final SAMFileHeader.SortOrder sortOrder;
|
||||
private final SAMRecordComparator comparator;
|
||||
|
||||
private boolean initialized = false;
|
||||
private boolean iterationStarted = false;
|
||||
|
||||
/**
|
||||
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||
* provided by the header merger parameter.
|
||||
* @param headerMerger The merged header and contents of readers.
|
||||
* @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
|
||||
* @deprecated replaced by (SamFileHeaderMerger, Collection<SAMFileReader>, boolean)
|
||||
*/
|
||||
public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) {
|
||||
this(headerMerger, headerMerger.getReaders(), forcePresorted);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||
* provided by the header merger parameter.
|
||||
* @param headerMerger The merged header and contents of readers.
|
||||
* @param assumeSorted false ensures that the iterator checks the headers of the readers for appropriate sort order.
|
||||
*/
|
||||
public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection<SAMFileReader> readers, final boolean assumeSorted) {
|
||||
this.samHeaderMerger = headerMerger;
|
||||
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
||||
this.comparator = getComparator();
|
||||
this.readers = readers;
|
||||
|
||||
this.pq = new PriorityQueue<ComparableSamRecordIterator>(readers.size());
|
||||
|
||||
for (final SAMFileReader reader : readers) {
|
||||
if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted &&
|
||||
reader.getFileHeader().getSortOrder() != this.sortOrder){
|
||||
throw new PicardException("Files are not compatible with sort order");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a given SAM file iterator to the merging iterator. Use this to restrict the merged iteration to a given genomic interval,
|
||||
* rather than iterating over every read in the backing file or stream.
|
||||
* @param reader Reader to add to the merging iterator.
|
||||
* @param iterator Iterator traversing over reader contents.
|
||||
*/
|
||||
public void addIterator(final SAMFileReader reader, final CloseableIterator<SAMRecord> iterator) {
|
||||
if(iterationStarted)
|
||||
throw new PicardException("Cannot add another iterator; iteration has already begun");
|
||||
if(!samHeaderMerger.containsHeader(reader.getFileHeader()))
|
||||
throw new PicardException("All iterators to be merged must be accounted for in the SAM header merger");
|
||||
final ComparableSamRecordIterator comparableIterator = new ComparableSamRecordIterator(reader,iterator,comparator);
|
||||
addIfNotEmpty(comparableIterator);
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
private void startIterationIfRequired() {
|
||||
if(initialized)
|
||||
return;
|
||||
for(SAMFileReader reader: readers)
|
||||
addIterator(reader,reader.iterator());
|
||||
iterationStarted = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close down all open iterators.
|
||||
*/
|
||||
public void close() {
|
||||
// Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue.
|
||||
for(CloseableIterator<SAMRecord> iterator: pq)
|
||||
iterator.close();
|
||||
}
|
||||
|
||||
/** Returns true if any of the underlying iterators has more records, otherwise false. */
|
||||
public boolean hasNext() {
|
||||
startIterationIfRequired();
|
||||
return !this.pq.isEmpty();
|
||||
}
|
||||
|
||||
/** Returns the next record from the top most iterator during merging. */
|
||||
public SAMRecord next() {
|
||||
startIterationIfRequired();
|
||||
|
||||
final ComparableSamRecordIterator iterator = this.pq.poll();
|
||||
final SAMRecord record = iterator.next();
|
||||
addIfNotEmpty(iterator);
|
||||
record.setHeader(this.samHeaderMerger.getMergedHeader());
|
||||
|
||||
// Fix the read group if needs be
|
||||
if (this.samHeaderMerger.hasReadGroupCollisions()) {
|
||||
final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID);
|
||||
if (oldGroupId != null ) {
|
||||
final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(),oldGroupId);
|
||||
record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId);
|
||||
}
|
||||
}
|
||||
|
||||
// Fix the program group if needs be
|
||||
if (this.samHeaderMerger.hasProgramGroupCollisions()) {
|
||||
final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID);
|
||||
if (oldGroupId != null ) {
|
||||
final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(),oldGroupId);
|
||||
record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId);
|
||||
}
|
||||
}
|
||||
|
||||
// Fix up the sequence indexes if needs be
|
||||
if (this.samHeaderMerger.hasMergedSequenceDictionary()) {
|
||||
if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getReferenceIndex()));
|
||||
}
|
||||
|
||||
if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getMateReferenceIndex()));
|
||||
}
|
||||
}
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds iterator to priority queue. If the iterator has more records it is added
|
||||
* otherwise it is closed and not added.
|
||||
*/
|
||||
private void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
|
||||
if (iterator.hasNext()) {
|
||||
pq.offer(iterator);
|
||||
}
|
||||
else {
|
||||
iterator.close();
|
||||
}
|
||||
}
|
||||
|
||||
/** Unsupported operation. */
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the right comparator for a given sort order (coordinate, alphabetic). In the
|
||||
* case of "unsorted" it will return a comparator that gives an arbitrary but reflexive
|
||||
* ordering.
|
||||
*/
|
||||
private SAMRecordComparator getComparator() {
|
||||
// For unsorted build a fake comparator that compares based on object ID
|
||||
if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
|
||||
return new SAMRecordComparator() {
|
||||
public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
|
||||
return System.identityHashCode(lhs) - System.identityHashCode(rhs);
|
||||
}
|
||||
|
||||
public int compare(final SAMRecord lhs, final SAMRecord rhs) {
|
||||
return fileOrderCompare(lhs, rhs);
|
||||
}
|
||||
};
|
||||
}
|
||||
if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
|
||||
return new MergedSequenceDictionaryCoordinateOrderComparator();
|
||||
}
|
||||
|
||||
// Otherwise try and figure out what kind of comparator to return and build it
|
||||
return this.sortOrder.getComparatorInstance();
|
||||
}
|
||||
|
||||
/** Returns the merged header that the merging iterator is working from. */
|
||||
public SAMFileHeader getMergedHeader() {
|
||||
return this.samHeaderMerger.getMergedHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged
|
||||
* sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids
|
||||
* more copy & paste.
|
||||
*/
|
||||
private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator {
|
||||
|
||||
public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
|
||||
final int referenceIndex1 = getReferenceIndex(samRecord1);
|
||||
final int referenceIndex2 = getReferenceIndex(samRecord2);
|
||||
if (referenceIndex1 != referenceIndex2) {
|
||||
if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return 1;
|
||||
} else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return -1;
|
||||
} else {
|
||||
return referenceIndex1 - referenceIndex2;
|
||||
}
|
||||
}
|
||||
if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
// Both are unmapped.
|
||||
return 0;
|
||||
}
|
||||
return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart();
|
||||
}
|
||||
|
||||
private int getReferenceIndex(final SAMRecord samRecord) {
|
||||
if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex());
|
||||
}
|
||||
if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex());
|
||||
}
|
||||
return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
|
||||
}
|
||||
}
|
||||
}

@@ -1,744 +0,0 @@
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.picard.sam;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.samtools.AbstractSAMHeaderRecord;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
|
||||
/**
|
||||
* Merges SAMFileHeaders that have the same sequences into a single merged header
|
||||
* object while providing read group translation for cases where read groups
|
||||
* clash across input headers.
|
||||
*/
|
||||
public class SamFileHeaderMerger {
|
||||
//Super Header to construct
|
||||
private final SAMFileHeader mergedHeader;
|
||||
private Collection<SAMFileReader> readers;
|
||||
private final Collection<SAMFileHeader> headers;
|
||||
|
||||
//Translation of old group ids to new group ids
|
||||
private final Map<SAMFileHeader, Map<String, String>> samReadGroupIdTranslation =
|
||||
new IdentityHashMap<SAMFileHeader, Map<String, String>>();
|
||||
|
||||
//the read groups from different files use the same group ids
|
||||
private boolean hasReadGroupCollisions = false;
|
||||
|
||||
//the program records from different files use the same program record ids
|
||||
private boolean hasProgramGroupCollisions = false;
|
||||
|
||||
//Translation of old program group ids to new program group ids
|
||||
private Map<SAMFileHeader, Map<String, String>> samProgramGroupIdTranslation =
|
||||
new IdentityHashMap<SAMFileHeader, Map<String, String>>();
|
||||
|
||||
private boolean hasMergedSequenceDictionary = false;
|
||||
|
||||
// Translation of old sequence dictionary ids to new dictionary ids
|
||||
// This is an IdentityHashMap because it can be quite expensive to compute the hashCode for
|
||||
// large SAMFileHeaders. It is possible that two input files will have identical headers so that
|
||||
// the regular HashMap would fold them together, but the value stored in each of the two
|
||||
// Map entries will be the same, so it should not hurt anything.
|
||||
private final Map<SAMFileHeader, Map<Integer, Integer>> samSeqDictionaryIdTranslationViaHeader =
|
||||
new IdentityHashMap<SAMFileHeader, Map<Integer, Integer>>();
|
||||
|
||||
//HeaderRecordFactory that creates SAMReadGroupRecord instances.
|
||||
private static final HeaderRecordFactory<SAMReadGroupRecord> READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory<SAMReadGroupRecord>() {
|
||||
public SAMReadGroupRecord createRecord(String id, SAMReadGroupRecord srcReadGroupRecord) {
|
||||
return new SAMReadGroupRecord(id, srcReadGroupRecord);
|
||||
}
|
||||
};
|
||||
|
||||
//HeaderRecordFactory that creates SAMProgramRecord instances.
|
||||
private static final HeaderRecordFactory<SAMProgramRecord> PROGRAM_RECORD_FACTORY = new HeaderRecordFactory<SAMProgramRecord>() {
|
||||
public SAMProgramRecord createRecord(String id, SAMProgramRecord srcProgramRecord) {
|
||||
return new SAMProgramRecord(id, srcProgramRecord);
|
||||
}
|
||||
};
|
||||
|
||||
//comparator used to sort lists of program group and read group records
|
||||
private static final Comparator<AbstractSAMHeaderRecord> RECORD_ID_COMPARATOR = new Comparator<AbstractSAMHeaderRecord>() {
|
||||
public int compare(AbstractSAMHeaderRecord o1, AbstractSAMHeaderRecord o2) {
|
||||
return o1.getId().compareTo(o2.getId());
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Create SAMFileHeader with additional information. Required that sequence dictionaries agree.
|
||||
*
|
||||
* @param readers sam file readers to combine
|
||||
* @param sortOrder sort order new header should have
|
||||
* @deprecated replaced by SamFileHeaderMerger(Collection<SAMFileHeader>, SAMFileHeader.SortOrder, boolean)
|
||||
*/
|
||||
public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder) {
|
||||
this(readers, sortOrder, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create SAMFileHeader with additional information.
|
||||
*
|
||||
* @param readers sam file readers to combine
|
||||
* @param sortOrder sort order new header should have
|
||||
* @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
|
||||
* all input sequence dictionaries be identical.
|
||||
* @deprecated replaced by SamFileHeaderMerger(Collection<SAMFileHeader>, SAMFileHeader.SortOrder, boolean)
|
||||
*/
|
||||
public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) {
|
||||
this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries);
|
||||
this.readers = readers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create SAMFileHeader with additional information. This is the preferred constructor.
|
||||
*
|
||||
* @param sortOrder sort order new header should have
|
||||
* @param headers sam file headers to combine
|
||||
* @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
|
||||
* all input sequence dictionaries be identical.
|
||||
*/
|
||||
public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection<SAMFileHeader> headers, final boolean mergeDictionaries) {
|
||||
this.headers = headers;
|
||||
this.mergedHeader = new SAMFileHeader();
|
||||
|
||||
SAMSequenceDictionary sequenceDictionary;
|
||||
try {
|
||||
sequenceDictionary = getSequenceDictionary(headers);
|
||||
this.hasMergedSequenceDictionary = false;
|
||||
}
|
||||
catch (SequenceUtil.SequenceListsDifferException pe) {
|
||||
if (mergeDictionaries) {
|
||||
sequenceDictionary = mergeSequenceDictionaries(headers);
|
||||
this.hasMergedSequenceDictionary = true;
|
||||
}
|
||||
else {
|
||||
throw pe;
|
||||
}
|
||||
}
|
||||
|
||||
this.mergedHeader.setSequenceDictionary(sequenceDictionary);
|
||||
|
||||
// Set program that creates input alignments
|
||||
for (final SAMProgramRecord program : mergeProgramGroups(headers)) {
|
||||
this.mergedHeader.addProgramRecord(program);
|
||||
}
|
||||
|
||||
// Set read groups for merged header
|
||||
final List<SAMReadGroupRecord> readGroups = mergeReadGroups(headers);
|
||||
this.mergedHeader.setReadGroups(readGroups);
|
||||
this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none);
|
||||
|
||||
this.mergedHeader.setSortOrder(sortOrder);
|
||||
|
||||
for (final SAMFileHeader header : headers) {
|
||||
for (final String comment : header.getComments()) {
|
||||
this.mergedHeader.addComment(comment);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Utility method for use with the old constructor
|
||||
private static List<SAMFileHeader> getHeadersFromReaders(Collection<SAMFileReader> readers) {
|
||||
List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(readers.size());
|
||||
for (SAMFileReader reader : readers) {
|
||||
headers.add(reader.getFileHeader());
|
||||
}
|
||||
return headers;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks to see if there are clashes where different readers are using the same read
|
||||
* group IDs. If yes, then those IDs that collided are remapped.
|
||||
*
|
||||
* @param headers headers to combine
|
||||
* @return new list of read groups constructed from all the readers
|
||||
*/
|
||||
private List<SAMReadGroupRecord> mergeReadGroups(final Collection<SAMFileHeader> headers) {
|
||||
//prepare args for mergeHeaderRecords(..) call
|
||||
final HashSet<String> idsThatAreAlreadyTaken = new HashSet<String>();
|
||||
|
||||
final List<HeaderRecordAndFileHeader<SAMReadGroupRecord>> readGroupsToProcess = new LinkedList<HeaderRecordAndFileHeader<SAMReadGroupRecord>>();
|
||||
for (final SAMFileHeader header : headers) {
|
||||
for (final SAMReadGroupRecord readGroup : header.getReadGroups()) {
|
||||
//verify that there are no existing id collisions in this input file
|
||||
if(!idsThatAreAlreadyTaken.add(readGroup.getId()))
|
||||
throw new PicardException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")");
|
||||
|
||||
readGroupsToProcess.add(new HeaderRecordAndFileHeader<SAMReadGroupRecord>(readGroup, header));
|
||||
}
|
||||
idsThatAreAlreadyTaken.clear();
|
||||
}
|
||||
|
||||
final List<SAMReadGroupRecord> result = new LinkedList<SAMReadGroupRecord>();
|
||||
|
||||
hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result);
|
||||
|
||||
//sort the result list by record id
|
||||
Collections.sort(result, RECORD_ID_COMPARATOR);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks to see if there are clashes where different readers are using the same program
|
||||
* group IDs. If yes, then those IDs that collided are remapped.
|
||||
*
|
||||
* @param headers headers to combine
|
||||
* @return new list of program groups constructed from all the readers
|
||||
*/
|
||||
private List<SAMProgramRecord> mergeProgramGroups(final Collection<SAMFileHeader> headers) {
|
||||
|
||||
final List<SAMProgramRecord> overallResult = new LinkedList<SAMProgramRecord>();
|
||||
|
||||
//this Set will accumulate all SAMProgramRecord ids that have been encountered so far.
|
||||
final HashSet<String> idsThatAreAlreadyTaken = new HashSet<String>();
|
||||
|
||||
//need to process all program groups
|
||||
List<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcess = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
|
||||
for (final SAMFileHeader header : headers) {
|
||||
for (final SAMProgramRecord programGroup : header.getProgramRecords()) {
|
||||
//verify that there are no existing id collisions in this input file
|
||||
if(!idsThatAreAlreadyTaken.add(programGroup.getId()))
|
||||
throw new PicardException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")");
|
||||
|
||||
programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader<SAMProgramRecord>(programGroup, header));
|
||||
}
|
||||
idsThatAreAlreadyTaken.clear();
|
||||
}
|
||||
|
||||
//A program group header (let's say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to
|
||||
//another program group header (let's say ID=1 PN=A) to indicate that the given file was
|
||||
//processed by program A followed by program B. These PP attributes potentially
|
||||
//connect headers into one or more tree structures. Merging is done by
|
||||
//first merging all headers that don't have PP attributes (eg. tree roots),
|
||||
//then updating and merging all headers whose PPs point to the tree-root headers,
|
||||
//and so on until all program group headers are processed.
|
||||
|
||||
//currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (eg. the tree roots).
|
||||
List< HeaderRecordAndFileHeader<SAMProgramRecord> > currentProgramGroups = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
|
||||
for(final Iterator<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) {
|
||||
final HeaderRecordAndFileHeader<SAMProgramRecord> pair = programGroupsLeftToProcessIterator.next();
|
||||
if(pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) {
|
||||
programGroupsLeftToProcessIterator.remove();
|
||||
currentProgramGroups.add(pair);
|
||||
}
|
||||
}
|
||||
|
||||
//merge currentProgramGroups
|
||||
while(!currentProgramGroups.isEmpty())
|
||||
{
|
||||
final List<SAMProgramRecord> currentResult = new LinkedList<SAMProgramRecord>();
|
||||
|
||||
hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult);
|
||||
|
||||
//add currentResults to overallResults
|
||||
overallResult.addAll(currentResult);
|
||||
|
||||
//apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess
|
||||
currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false);
|
||||
programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true);
|
||||
|
||||
//find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups),
|
||||
//and move them to the list of programGroupsToProcessNext.
|
||||
LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsToProcessNext = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
|
||||
for(final Iterator<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) {
|
||||
final HeaderRecordAndFileHeader<SAMProgramRecord> pairLeftToProcess = programGroupsLeftToProcessIterator.next();
|
||||
final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG);
|
||||
//find what currentProgramGroups this ppId points to (NOTE: they have to come from the same file)
|
||||
for(final HeaderRecordAndFileHeader<SAMProgramRecord> justProcessedPair : currentProgramGroups) {
|
||||
String idJustProcessed = justProcessedPair.getHeaderRecord().getId();
|
||||
if(pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) {
|
||||
programGroupsLeftToProcessIterator.remove();
|
||||
programGroupsToProcessNext.add(pairLeftToProcess);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
currentProgramGroups = programGroupsToProcessNext;
|
||||
}
|
||||
|
||||
//verify that all records were processed
|
||||
if(!programGroupsLeftToProcess.isEmpty()) {
|
||||
StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n");
|
||||
for( final HeaderRecordAndFileHeader<SAMProgramRecord> pair : programGroupsLeftToProcess ) {
|
||||
SAMProgramRecord record = pair.getHeaderRecord();
|
||||
errorMsg.append("@PG ID:"+record.getProgramGroupId()+" PN:"+record.getProgramName()+" PP:"+record.getPreviousProgramGroupId() +"\n");
|
||||
}
|
||||
throw new PicardException(errorMsg.toString());
|
||||
}
|
||||
|
||||
//sort the result list by record id
|
||||
Collections.sort(overallResult, RECORD_ID_COMPARATOR);
|
||||
|
||||
return overallResult;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility method that takes a list of program groups and remaps all their
|
||||
* ids (including ppIds if requested) using the given idTranslationTable.
|
||||
*
|
||||
* NOTE: when remapping, this method creates new SAMProgramRecords and
|
||||
* doesn't mutate any records in the programGroups list.
|
||||
*
|
||||
* @param programGroups The program groups to translate.
|
||||
* @param idTranslationTable The translation table.
|
||||
* @param translatePpIds Whether ppIds should be translated as well.
|
||||
*
|
||||
* @return The list of translated records.
|
||||
*/
|
||||
private List<HeaderRecordAndFileHeader<SAMProgramRecord>> translateIds(
|
||||
List<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroups,
|
||||
Map<SAMFileHeader, Map<String, String>> idTranslationTable,
|
||||
boolean translatePpIds) {
|
||||
|
||||
//go through programGroups and translate any IDs and PPs based on the idTranslationTable.
|
||||
List<HeaderRecordAndFileHeader<SAMProgramRecord>> result = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
|
||||
for(final HeaderRecordAndFileHeader<SAMProgramRecord> pair : programGroups ) {
|
||||
final SAMProgramRecord record = pair.getHeaderRecord();
|
||||
final String id = record.getProgramGroupId();
|
||||
final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG);
|
||||
|
||||
final SAMFileHeader header = pair.getFileHeader();
|
||||
final Map<String, String> translations = idTranslationTable.get(header);
|
||||
|
||||
//see if one or both ids need to be translated
|
||||
SAMProgramRecord translatedRecord = null;
|
||||
if(translations != null)
|
||||
{
|
||||
String translatedId = translations.get( id );
|
||||
String translatedPpId = translatePpIds ? translations.get( ppId ) : null;
|
||||
|
||||
boolean needToTranslateId = translatedId != null && !translatedId.equals(id);
|
||||
boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId);
|
||||
|
||||
if(needToTranslateId && needToTranslatePpId) {
|
||||
translatedRecord = new SAMProgramRecord(translatedId, record);
|
||||
translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId);
|
||||
} else if(needToTranslateId) {
|
||||
translatedRecord = new SAMProgramRecord(translatedId, record);
|
||||
} else if(needToTranslatePpId) {
|
||||
translatedRecord = new SAMProgramRecord(id, record);
|
||||
translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId);
|
||||
}
|
||||
}
|
||||
|
||||
if(translatedRecord != null) {
|
||||
result.add(new HeaderRecordAndFileHeader<SAMProgramRecord>(translatedRecord, header));
|
||||
} else {
|
||||
result.add(pair); //keep the original record
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility method for merging a List of AbstractSAMHeaderRecords. If it finds
|
||||
* records that have identical ids and attributes, it will collapse them
|
||||
* into one record. If it finds records that have identical ids but
|
||||
* non-identical attributes, this is treated as a collision. When collision happens,
|
||||
* the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable.
|
||||
*
|
||||
* NOTE: Non-collided records also get recorded in the idTranslationTable as
|
||||
* old-id to old-id. This way, an idTranslationTable lookup should never return null.
|
||||
*
|
||||
* @param headerRecords The header records to merge.
|
||||
* @param headerRecordFactory Constructs a specific subclass of AbstractSAMHeaderRecord.
|
||||
* @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a collision, and the headRecord's id will be remapped.
|
||||
* @param idTranslationTable When records collide, their ids are remapped, and an old-id to new-id
|
||||
* mapping is added to the idTranslationTable. Non-collided records also get recorded in the idTranslationTable as
|
||||
* old-id to old-id. This way, an idTranslationTable lookup should never return null.
|
||||
*
|
||||
* @param result The list of merged header records.
|
||||
*
|
||||
* @return True if there were collisions.
|
||||
*/
|
||||
private <RecordType extends AbstractSAMHeaderRecord> boolean mergeHeaderRecords(final List<HeaderRecordAndFileHeader<RecordType>> headerRecords, HeaderRecordFactory<RecordType> headerRecordFactory,
|
||||
final HashSet<String> idsThatAreAlreadyTaken, Map<SAMFileHeader, Map<String, String>> idTranslationTable, List<RecordType> result) {
|
||||
|
||||
//The outer Map bins the header records by their ids. The nested Map further collapses
|
||||
//header records which, in addition to having the same id, also have identical attributes.
|
||||
//In other words, each key in the nested map represents one or more
|
||||
//header records which have both identical ids and identical attributes. The List of
|
||||
//SAMFileHeaders keeps track of which readers these header record(s) came from.
|
||||
final Map<String, Map<RecordType, List<SAMFileHeader>>> idToRecord =
|
||||
new HashMap<String, Map<RecordType, List<SAMFileHeader>>>();
|
||||
|
||||
//Populate the idToRecord and seenIds data structures
|
||||
for (final HeaderRecordAndFileHeader<RecordType> pair : headerRecords) {
|
||||
final RecordType record = pair.getHeaderRecord();
|
||||
final SAMFileHeader header = pair.getFileHeader();
|
||||
final String recordId = record.getId();
|
||||
Map<RecordType, List<SAMFileHeader>> recordsWithSameId = idToRecord.get(recordId);
|
||||
if(recordsWithSameId == null) {
|
||||
recordsWithSameId = new LinkedHashMap<RecordType, List<SAMFileHeader>>();
|
||||
idToRecord.put(recordId, recordsWithSameId);
|
||||
}
|
||||
|
||||
List<SAMFileHeader> fileHeaders = recordsWithSameId.get(record);
|
||||
if(fileHeaders == null) {
|
||||
fileHeaders = new LinkedList<SAMFileHeader>();
|
||||
recordsWithSameId.put(record, fileHeaders);
|
||||
}
|
||||
|
||||
fileHeaders.add(header);
|
||||
}
|
||||
|
||||
//Resolve any collisions between header records by remapping their ids.
|
||||
boolean hasCollisions = false;
|
||||
for (final Map.Entry<String, Map<RecordType, List<SAMFileHeader>>> entry : idToRecord.entrySet() )
|
||||
{
|
||||
final String recordId = entry.getKey();
|
||||
final Map<RecordType, List<SAMFileHeader>> recordsWithSameId = entry.getValue();
|
||||
|
||||
|
||||
for( Map.Entry<RecordType, List<SAMFileHeader>> recordWithUniqueAttr : recordsWithSameId.entrySet()) {
|
||||
final RecordType record = recordWithUniqueAttr.getKey();
|
||||
final List<SAMFileHeader> fileHeaders = recordWithUniqueAttr.getValue();
|
||||
|
||||
String newId;
|
||||
if(!idsThatAreAlreadyTaken.contains(recordId)) {
|
||||
//don't remap 1st record. If there are more records
|
||||
//with this id, they will be remapped in the 'else'.
|
||||
newId = recordId;
|
||||
idsThatAreAlreadyTaken.add(recordId);
|
||||
} else {
|
||||
//there is more than one record with this id.
|
||||
hasCollisions = true;
|
||||
|
||||
//find a unique newId for this record
|
||||
int idx=1;
|
||||
while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + Integer.toString(idx++)))
|
||||
;
|
||||
|
||||
idsThatAreAlreadyTaken.add( newId );
|
||||
}
|
||||
|
||||
for(SAMFileHeader fileHeader : fileHeaders) {
|
||||
Map<String, String> readerTranslationTable = idTranslationTable.get(fileHeader);
|
||||
if(readerTranslationTable == null) {
|
||||
readerTranslationTable = new HashMap<String, String>();
|
||||
idTranslationTable.put(fileHeader, readerTranslationTable);
|
||||
}
|
||||
readerTranslationTable.put(recordId, newId);
|
||||
}
|
||||
|
||||
result.add( headerRecordFactory.createRecord(newId, record) );
|
||||
}
|
||||
}
|
||||
|
||||
return hasCollisions;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the sequences off the SAMFileHeader. Throws a runtime exception if the sequences
* are different from one another.
|
||||
*
|
||||
* @param headers headers to pull sequences from
|
||||
* @return sequences from files. Each file should have the same sequence
|
||||
*/
|
||||
private SAMSequenceDictionary getSequenceDictionary(final Collection<SAMFileHeader> headers) {
|
||||
SAMSequenceDictionary sequences = null;
|
||||
for (final SAMFileHeader header : headers) {
|
||||
|
||||
if (sequences == null) {
|
||||
sequences = header.getSequenceDictionary();
|
||||
}
|
||||
else {
|
||||
final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
|
||||
SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences);
|
||||
}
|
||||
}
|
||||
|
||||
return sequences;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries.
|
||||
*
|
||||
* @param headers headers to pull sequences from
|
||||
* @return sequences from files. Each file should have the same sequence
|
||||
*/
|
||||
private SAMSequenceDictionary mergeSequenceDictionaries(final Collection<SAMFileHeader> headers) {
|
||||
SAMSequenceDictionary sequences = new SAMSequenceDictionary();
|
||||
for (final SAMFileHeader header : headers) {
|
||||
final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
|
||||
sequences = mergeSequences(sequences, currentSequences);
|
||||
}
|
||||
// second pass, make a map of the original sequence id -> new sequence id
|
||||
createSequenceMapping(headers, sequences);
|
||||
return sequences;
|
||||
}
|
||||
|
||||
/**
|
||||
* They've asked to merge the sequence headers. What we support right now is finding the sequence name superset.
|
||||
*
|
||||
* @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals.
|
||||
* @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict.
|
||||
* @return A new sequence dictionary that resulting from merging the two inputs.
|
||||
*/
|
||||
private SAMSequenceDictionary mergeSequences(SAMSequenceDictionary mergeIntoDict, SAMSequenceDictionary mergeFromDict) {
|
||||
|
||||
// a place to hold the sequences that we haven't found a home for, in the order they appear in mergeFromDict.
|
||||
LinkedList<SAMSequenceRecord> holder = new LinkedList<SAMSequenceRecord>();
|
||||
|
||||
// Return value will be created from this.
|
||||
LinkedList<SAMSequenceRecord> resultingDict = new LinkedList<SAMSequenceRecord>();
|
||||
for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) {
|
||||
resultingDict.add(sequenceRecord);
|
||||
}
|
||||
|
||||
// Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
|
||||
int prevloc = -1;
|
||||
// Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
|
||||
SAMSequenceRecord previouslyMerged = null;
|
||||
|
||||
for (SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) {
|
||||
// Does it already exist in resultingDict?
|
||||
int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName());
|
||||
if (loc == -1) {
|
||||
// If it doesn't already exist in resultingDict, save it and decide where to insert it later.
|
||||
holder.add(sequenceRecord.clone());
|
||||
} else if (prevloc > loc) {
|
||||
// If sequenceRecord already exists in resultingDict, but prior to the previous one
|
||||
// from mergeIntoDict that already existed, cannot merge.
|
||||
throw new PicardException("Cannot merge sequence dictionaries because sequence " +
|
||||
sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() +
|
||||
" are in different orders in two input sequence dictionaries.");
|
||||
} else {
|
||||
// Since sequenceRecord already exists in resultingDict, don't need to add it.
|
||||
// Add in all the sequences prior to it that have been held in holder.
|
||||
resultingDict.addAll(loc, holder);
|
||||
// Remember the index of sequenceRecord so we can check for merge incompatibility.
|
||||
prevloc = loc + holder.size();
|
||||
previouslyMerged = sequenceRecord;
|
||||
holder.clear();
|
||||
}
|
||||
}
|
||||
// Append anything left in holder.
|
||||
if (holder.size() != 0) {
|
||||
resultingDict.addAll(holder);
|
||||
}
|
||||
return new SAMSequenceDictionary(resultingDict);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find sequence in list.
|
||||
* @param list List to search for the sequence name.
|
||||
* @param sequenceName Name to search for.
|
||||
* @return Index of SAMSequenceRecord with the given name in list, or -1 if not found.
|
||||
*/
|
||||
private static int getIndexOfSequenceName(final List<SAMSequenceRecord> list, final String sequenceName) {
|
||||
for (int i = 0; i < list.size(); ++i) {
|
||||
if (list.get(i).getSequenceName().equals(sequenceName)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the sequence mapping. This map is used to convert the unmerged header sequence ID's to the merged
|
||||
* list of sequence id's.
|
||||
* @param headers the collections of headers.
|
||||
* @param masterDictionary the superset dictionary we've created.
|
||||
*/
|
||||
private void createSequenceMapping(final Collection<SAMFileHeader> headers, SAMSequenceDictionary masterDictionary) {
|
||||
LinkedList<String> resultingDictStr = new LinkedList<String>();
|
||||
for (SAMSequenceRecord r : masterDictionary.getSequences()) {
|
||||
resultingDictStr.add(r.getSequenceName());
|
||||
}
|
||||
for (final SAMFileHeader header : headers) {
|
||||
Map<Integer, Integer> seqMap = new HashMap<Integer, Integer>();
|
||||
SAMSequenceDictionary dict = header.getSequenceDictionary();
|
||||
for (SAMSequenceRecord rec : dict.getSequences()) {
|
||||
seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName()));
|
||||
}
|
||||
this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap);
|
||||
}
|
||||
}
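// Illustrative sketch (not in the original source): if the merged dictionary is chr1, chr2, chr3
// and one input header contains only chr2 (index 0) and chr3 (index 1), the map stored for that
// header is {0 -> 1, 1 -> 2}, i.e. old sequence index -> index in the merged dictionary.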
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the read group id that should be used for the input read and RG id.
|
||||
*
|
||||
* @deprecated replaced by getReadGroupId(SAMFileHeader, String)
|
||||
* */
|
||||
public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) {
|
||||
return getReadGroupId(reader.getFileHeader(), originalReadGroupId);
|
||||
}
|
||||
|
||||
/** Returns the read group id that should be used for the input read and RG id. */
|
||||
public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) {
|
||||
return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param reader one of the input files
|
||||
* @param originalProgramGroupId a program group ID from the above input file
|
||||
* @return new ID from the merged list of program groups in the output file
|
||||
* @deprecated replaced by getProgramGroupId(SAMFileHeader, String)
|
||||
*/
|
||||
public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) {
|
||||
return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param header one of the input headers
|
||||
* @param originalProgramGroupId a program group ID from the above input file
|
||||
* @return new ID from the merged list of program groups in the output file
|
||||
*/
|
||||
public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) {
|
||||
return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId);
|
||||
}
|
||||
|
||||
/** Returns true if there are read group duplicates within the merged headers. */
|
||||
public boolean hasReadGroupCollisions() {
|
||||
return this.hasReadGroupCollisions;
|
||||
}
|
||||
|
||||
/** Returns true if there are program group duplicates within the merged headers. */
|
||||
public boolean hasProgramGroupCollisions() {
|
||||
return hasProgramGroupCollisions;
|
||||
}
|
||||
|
||||
/** Returns true if the sequence dictionaries have been merged. */
|
||||
public boolean hasMergedSequenceDictionary() {
|
||||
return hasMergedSequenceDictionary;
|
||||
}
|
||||
|
||||
/** Returns the merged header that should be written to any output merged file. */
|
||||
public SAMFileHeader getMergedHeader() {
|
||||
return this.mergedHeader;
|
||||
}
|
||||
|
||||
/** Returns the collection of readers that this header merger is working with. May return null.
|
||||
* @deprecated replaced by getHeaders()
|
||||
*/
|
||||
public Collection<SAMFileReader> getReaders() {
|
||||
return this.readers;
|
||||
}
|
||||
|
||||
/** Returns the collection of readers that this header merger is working with.
|
||||
*/
|
||||
public Collection<SAMFileHeader> getHeaders() {
|
||||
return this.headers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells whether this header merger contains a given SAM file header. Note that header presence
|
||||
* is tested by reference (==) equality, rather than by SAMFileHeader.equals(), for
|
||||
* reasons of performance.
|
||||
* @param header header to check for.
|
||||
* @return True if the header exists in this HeaderMerger. False otherwise.
|
||||
*/
|
||||
boolean containsHeader(SAMFileHeader header) {
|
||||
for(SAMFileHeader headerMergerHeader: headers) {
|
||||
if(headerMergerHeader == header)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the new mapping for a specified reader, given its old sequence index.
|
||||
* @param reader the reader
|
||||
* @param oldReferenceSequenceIndex the old sequence (also called reference) index
|
||||
* @return the new index value
|
||||
* @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer)
|
||||
*/
|
||||
public Integer getMergedSequenceIndex(SAMFileReader reader, Integer oldReferenceSequenceIndex) {
|
||||
return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Another mechanism for getting the new sequence index, for situations in which the reader is not available.
|
||||
* Note that if the SAMRecord has already had its header replaced with the merged header, this won't work.
|
||||
* @param header The original header for the input record in question.
|
||||
* @param oldReferenceSequenceIndex The original sequence index.
|
||||
* @return the new index value that is compatible with the merged sequence index.
|
||||
*/
|
||||
public Integer getMergedSequenceIndex(final SAMFileHeader header, Integer oldReferenceSequenceIndex) {
|
||||
final Map<Integer, Integer> mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header);
|
||||
if (mapping == null) {
|
||||
throw new PicardException("No sequence dictionary mapping available for header: " + header);
|
||||
}
|
||||
|
||||
final Integer newIndex = mapping.get(oldReferenceSequenceIndex);
|
||||
if (newIndex == null) {
|
||||
throw new PicardException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header);
|
||||
}
|
||||
|
||||
return newIndex;
|
||||
}
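// Hedged usage sketch (not in the original source): translating a record read from one of the
// inputs onto the merged dictionary before writing it out. "merger", "record" and "mergedWriter"
// are hypothetical names for the header merger, an input SAMRecord and an output writer.
//
//   final SAMFileHeader originalHeader = record.getHeader();
//   record.setHeader(merger.getMergedHeader());
//   record.setReferenceIndex(merger.getMergedSequenceIndex(originalHeader, record.getReferenceIndex()));
//   mergedWriter.addAlignment(record);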
|
||||
|
||||
|
||||
/**
|
||||
* Implementations of this interface are used by mergeHeaderRecords(..) to instantiate
|
||||
* specific subclasses of AbstractSAMHeaderRecord.
|
||||
*/
|
||||
private static interface HeaderRecordFactory<RecordType extends AbstractSAMHeaderRecord> {
|
||||
|
||||
/**
|
||||
* Constructs a new instance of RecordType.
|
||||
* @param id The id of the new record.
|
||||
* @param srcRecord Except for the id, the new record will be a copy of this source record.
|
||||
*/
|
||||
public RecordType createRecord(final String id, RecordType srcRecord);
|
||||
}
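// Illustrative sketch (not in the original source): a factory of this shape could be used by
// mergeHeaderRecords(..) to clone a read group under a new id; the factories actually used
// internally are not shown in this excerpt.
//
//   private static final HeaderRecordFactory<SAMReadGroupRecord> READ_GROUP_FACTORY =
//           new HeaderRecordFactory<SAMReadGroupRecord>() {
//               public SAMReadGroupRecord createRecord(final String id, final SAMReadGroupRecord src) {
//                   return new SAMReadGroupRecord(id, src);
//               }
//           };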
|
||||
|
||||
/**
|
||||
* Struct that groups together a subclass of AbstractSAMHeaderRecord with the
|
||||
* SAMFileHeader that it came from.
|
||||
*/
|
||||
private static class HeaderRecordAndFileHeader<RecordType extends AbstractSAMHeaderRecord> {
|
||||
private RecordType headerRecord;
|
||||
private SAMFileHeader samFileHeader;
|
||||
|
||||
public HeaderRecordAndFileHeader(RecordType headerRecord, SAMFileHeader samFileHeader) {
|
||||
this.headerRecord = headerRecord;
|
||||
this.samFileHeader = samFileHeader;
|
||||
}
|
||||
|
||||
public RecordType getHeaderRecord() {
|
||||
return headerRecord;
|
||||
}
|
||||
public SAMFileHeader getFileHeader() {
|
||||
return samFileHeader;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,762 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools;
|
||||
|
||||
|
||||
import net.sf.samtools.util.*;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Internal class for reading and querying BAM files.
|
||||
*/
|
||||
class BAMFileReader extends SAMFileReader.ReaderImplementation {
|
||||
// True if reading from a File rather than an InputStream
|
||||
private boolean mIsSeekable = false;
|
||||
|
||||
// For converting bytes into other primitive types
|
||||
private BinaryCodec mStream = null;
|
||||
|
||||
// Underlying compressed data stream.
|
||||
private final BAMInputStream mInputStream;
|
||||
private SAMFileHeader mFileHeader = null;
|
||||
|
||||
// Populated if the file is seekable and an index exists
|
||||
private File mIndexFile;
|
||||
private BAMIndex mIndex = null;
|
||||
private long mFirstRecordPointer = 0;
|
||||
private CloseableIterator<SAMRecord> mCurrentIterator = null;
|
||||
|
||||
// If true, all SAMRecords are fully decoded as they are read.
|
||||
private final boolean eagerDecode;
|
||||
|
||||
// For error-checking.
|
||||
private ValidationStringency mValidationStringency;
|
||||
|
||||
// For creating BAMRecords
|
||||
private SAMRecordFactory samRecordFactory;
|
||||
|
||||
/**
|
||||
* Use the caching index reader implementation rather than the disk-hit-per-file model.
|
||||
*/
|
||||
private boolean mEnableIndexCaching = false;
|
||||
|
||||
/**
|
||||
* Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
|
||||
*/
|
||||
private boolean mEnableIndexMemoryMapping = true;
|
||||
|
||||
/**
|
||||
* Add information about the origin (reader and position) to SAM records.
|
||||
*/
|
||||
private SAMFileReader mFileReader = null;
|
||||
|
||||
/**
|
||||
* Prepare to read BAM from a stream (not seekable)
|
||||
* @param stream source of bytes.
|
||||
* @param eagerDecode if true, decode all BAM fields as they are read rather than lazily.
* @param validationStringency Controls how to handle invalid reads or header lines.
|
||||
*/
|
||||
BAMFileReader(final InputStream stream,
|
||||
final File indexFile,
|
||||
final boolean eagerDecode,
|
||||
final ValidationStringency validationStringency,
|
||||
final SAMRecordFactory factory)
|
||||
throws IOException {
|
||||
mIndexFile = indexFile;
|
||||
mIsSeekable = false;
|
||||
mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
|
||||
mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
|
||||
this.eagerDecode = eagerDecode;
|
||||
this.mValidationStringency = validationStringency;
|
||||
this.samRecordFactory = factory;
|
||||
readHeader(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to read BAM from a file (seekable)
|
||||
* @param file source of bytes.
|
||||
* @param eagerDecode if true, decode all BAM fields as they are read rather than lazily.
* @param validationStringency Controls how to handle invalid reads or header lines.
|
||||
*/
|
||||
BAMFileReader(final File file,
|
||||
final File indexFile,
|
||||
final boolean eagerDecode,
|
||||
final ValidationStringency validationStringency,
|
||||
final SAMRecordFactory factory)
|
||||
throws IOException {
|
||||
this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
|
||||
if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
|
||||
System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
|
||||
" is older than BAM " + file.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
BAMFileReader(final SeekableStream strm,
|
||||
final File indexFile,
|
||||
final boolean eagerDecode,
|
||||
final ValidationStringency validationStringency,
|
||||
final SAMRecordFactory factory)
|
||||
throws IOException {
|
||||
this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
|
||||
indexFile,
|
||||
eagerDecode,
|
||||
strm.getSource(),
|
||||
validationStringency,
|
||||
factory);
|
||||
}
|
||||
|
||||
private BAMFileReader(final BAMInputStream inputStream,
|
||||
final File indexFile,
|
||||
final boolean eagerDecode,
|
||||
final String source,
|
||||
final ValidationStringency validationStringency,
|
||||
final SAMRecordFactory factory)
|
||||
throws IOException {
|
||||
mIndexFile = indexFile;
|
||||
mIsSeekable = true;
|
||||
mInputStream = inputStream;
|
||||
mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
|
||||
this.eagerDecode = eagerDecode;
|
||||
this.mValidationStringency = validationStringency;
|
||||
this.samRecordFactory = factory;
|
||||
readHeader(source);
|
||||
mFirstRecordPointer = inputStream.getFilePointer();
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, writes the source of every read into the source SAMRecords.
|
||||
* @param enabled true to write source information into each SAMRecord.
|
||||
*/
|
||||
void enableFileSource(final SAMFileReader reader, final boolean enabled) {
|
||||
this.mFileReader = enabled ? reader : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, uses the caching version of the index reader.
|
||||
* @param enabled true to use the caching index reader implementation.
|
||||
*/
|
||||
public void enableIndexCaching(final boolean enabled) {
|
||||
if(mIndex != null)
|
||||
throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
|
||||
this.mEnableIndexCaching = enabled;
|
||||
}
|
||||
|
||||
/**
|
||||
* If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
|
||||
* This is slower but more scalable when accessing large numbers of BAM files sequentially.
|
||||
* @param enabled True to use memory mapping, false to use regular I/O.
|
||||
*/
|
||||
public void enableIndexMemoryMapping(final boolean enabled) {
|
||||
if (mIndex != null) {
|
||||
throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
|
||||
}
|
||||
this.mEnableIndexMemoryMapping = enabled;
|
||||
}
|
||||
|
||||
@Override void enableCrcChecking(final boolean enabled) {
|
||||
this.mInputStream.setCheckCrcs(enabled);
|
||||
}
|
||||
|
||||
@Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; }
|
||||
|
||||
/**
|
||||
* @return true if this is a BAM file and an index is available
|
||||
*/
|
||||
public boolean hasIndex() {
|
||||
return (mIndexFile != null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the index for this BAM file, loading it on first access.
|
||||
* @return An index of the given type.
|
||||
*/
|
||||
public BAMIndex getIndex() {
|
||||
if(mIndexFile == null)
|
||||
throw new SAMException("No index is available for this BAM file.");
|
||||
if(mIndex == null)
|
||||
mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
|
||||
: new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
|
||||
return mIndex;
|
||||
}
|
||||
|
||||
void close() {
|
||||
if (mStream != null) {
|
||||
mStream.close();
|
||||
}
|
||||
if (mIndex != null) {
|
||||
mIndex.close();
|
||||
}
|
||||
mStream = null;
|
||||
mFileHeader = null;
|
||||
mIndex = null;
|
||||
}
|
||||
|
||||
SAMFileHeader getFileHeader() {
|
||||
return mFileHeader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set error-checking level for subsequent SAMRecord reads.
|
||||
*/
|
||||
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
|
||||
this.mValidationStringency = validationStringency;
|
||||
}
|
||||
|
||||
SAMFileReader.ValidationStringency getValidationStringency() {
|
||||
return this.mValidationStringency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords in file order.
|
||||
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
|
||||
* that iterator must be closed before getIterator() can be called again.
|
||||
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
|
||||
* getIterator() begins its iteration where the last one left off. That is the best that can be
|
||||
* done in that situation.
|
||||
*/
|
||||
CloseableIterator<SAMRecord> getIterator() {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (mIsSeekable) {
|
||||
try {
|
||||
mInputStream.seek(mFirstRecordPointer);
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
mCurrentIterator = new BAMFileIterator();
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
@Override
|
||||
CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!(chunks instanceof BAMFileSpan)) {
|
||||
throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
|
||||
}
|
||||
|
||||
// Create an iterator over the given chunk boundaries.
|
||||
mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
|
||||
* when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
|
||||
* the rightmost bound is guaranteed to be after the last read in the file.
|
||||
* @return An unbounded pointer to the first record in the BAM file.
|
||||
*/
|
||||
@Override
|
||||
SAMFileSpan getFilePointerSpanningReads() {
|
||||
return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords that match the given interval.
|
||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||
* before calling any of the methods that return an iterator.
|
||||
*
|
||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||
* matches the specified interval.
|
||||
*
|
||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||
*
|
||||
* @param sequence Reference sequence sought.
|
||||
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
|
||||
* A value of zero implies the start of the reference sequence.
|
||||
* @param end A value of zero implies the end of the reference sequence.
|
||||
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
|
||||
* specified by start and end. If false, the SAMRecords need only overlap the interval.
|
||||
* @return Iterator for the matching SAMRecords
|
||||
*/
|
||||
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
|
||||
return mCurrentIterator;
|
||||
}
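// Hedged usage sketch (not in the original source): callers normally reach this method through
// SAMFileReader. Assuming a coordinate-sorted, indexed "input.bam",
//
//   final SAMFileReader reader = new SAMFileReader(new File("input.bam"));
//   // overlapping query: any read touching chr1:10,000-20,000
//   final CloseableIterator<SAMRecord> it = reader.query("chr1", 10000, 20000, false);
//
// passing true for the last argument instead restricts the iterator to reads fully contained in
// the interval.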
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords with the given alignment start.
|
||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||
* before calling any of the methods that return an iterator.
|
||||
*
|
||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||
* matches the specified interval.
|
||||
*
|
||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||
*
|
||||
* @param sequence Reference sequence sought.
|
||||
* @param start Alignment start sought.
|
||||
* @return Iterator for the matching SAMRecords.
|
||||
*/
|
||||
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
public CloseableIterator<SAMRecord> queryUnmapped() {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
try {
|
||||
final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
|
||||
if (startOfLastLinearBin != -1) {
|
||||
mInputStream.seek(startOfLastLinearBin);
|
||||
} else {
|
||||
// No mapped reads in file, just start at the first read in file.
|
||||
mInputStream.seek(mFirstRecordPointer);
|
||||
}
|
||||
mCurrentIterator = new BAMFileIndexUnmappedIterator();
|
||||
return mCurrentIterator;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("IOException seeking to unmapped reads", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the header from the file or stream
|
||||
* @param source Note that this is used only for reporting errors.
|
||||
*/
|
||||
private void readHeader(final String source)
|
||||
throws IOException {
|
||||
|
||||
final byte[] buffer = new byte[4];
|
||||
mStream.readBytes(buffer);
|
||||
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
|
||||
throw new IOException("Invalid BAM file header");
|
||||
}
|
||||
|
||||
final int headerTextLength = mStream.readInt();
|
||||
final String textHeader = mStream.readString(headerTextLength);
|
||||
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
|
||||
headerCodec.setValidationStringency(mValidationStringency);
|
||||
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
|
||||
source);
|
||||
|
||||
final int sequenceCount = mStream.readInt();
|
||||
if (mFileHeader.getSequenceDictionary().size() > 0) {
|
||||
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
|
||||
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
|
||||
throw new SAMFormatException("Number of sequences in text header (" +
|
||||
mFileHeader.getSequenceDictionary().size() +
|
||||
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
|
||||
}
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
|
||||
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
|
||||
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
|
||||
source);
|
||||
}
|
||||
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
|
||||
source);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If only binary sequences are present, copy them into mFileHeader
|
||||
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
sequences.add(readSequenceRecord(source));
|
||||
}
|
||||
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a single binary sequence record from the file or stream
|
||||
* @param source Note that this is used only for reporting errors.
|
||||
*/
|
||||
private SAMSequenceRecord readSequenceRecord(final String source) {
|
||||
final int nameLength = mStream.readInt();
|
||||
if (nameLength <= 1) {
|
||||
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
|
||||
}
|
||||
final String sequenceName = mStream.readString(nameLength - 1);
|
||||
// Skip the null terminator
|
||||
mStream.readByte();
|
||||
final int sequenceLength = mStream.readInt();
|
||||
return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
|
||||
}
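// Illustrative sketch (not in the original source): for a reference named "chr1" of length 1000,
// the bytes consumed here are, in little-endian order,
//
//   05 00 00 00        l_name = 5 (includes the trailing NUL)
//   63 68 72 31 00     "chr1" plus NUL terminator
//   E8 03 00 00        l_ref = 1000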
|
||||
|
||||
/**
|
||||
* Iterator for non-indexed sequential iteration through all SAMRecords in file.
|
||||
* Starting point of iteration is wherever current file position is when the iterator is constructed.
|
||||
*/
|
||||
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
|
||||
private SAMRecord mNextRecord = null;
|
||||
private final BAMRecordCodec bamRecordCodec;
|
||||
private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file
|
||||
|
||||
BAMFileIterator() {
|
||||
this(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param advance Trick to enable subclass to do more setup before advancing
|
||||
*/
|
||||
BAMFileIterator(final boolean advance) {
|
||||
this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
|
||||
this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
|
||||
|
||||
if (advance) {
|
||||
advance();
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (mCurrentIterator != null && this != mCurrentIterator) {
|
||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||
}
|
||||
mCurrentIterator = null;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return (mNextRecord != null);
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
final SAMRecord result = mNextRecord;
|
||||
advance();
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
void advance() {
|
||||
try {
|
||||
mNextRecord = getNextRecord();
|
||||
|
||||
if (mNextRecord != null) {
|
||||
++this.samRecordIndex;
|
||||
// Because some decoding is done lazily, the record needs to remember the validation stringency.
|
||||
mNextRecord.setValidationStringency(mValidationStringency);
|
||||
|
||||
if (mValidationStringency != ValidationStringency.SILENT) {
|
||||
final List<SAMValidationError> validationErrors = mNextRecord.isValid();
|
||||
SAMUtils.processValidationErrors(validationErrors,
|
||||
this.samRecordIndex, BAMFileReader.this.getValidationStringency());
|
||||
}
|
||||
}
|
||||
if (eagerDecode && mNextRecord != null) {
|
||||
mNextRecord.eagerDecode();
|
||||
}
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the next record from the input stream.
|
||||
*/
|
||||
SAMRecord getNextRecord() throws IOException {
|
||||
final long startCoordinate = mInputStream.getFilePointer();
|
||||
final SAMRecord next = bamRecordCodec.decode();
|
||||
final long stopCoordinate = mInputStream.getFilePointer();
|
||||
|
||||
if(mFileReader != null && next != null)
|
||||
next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The record that will be returned by the next call to next()
|
||||
*/
|
||||
protected SAMRecord peek() {
|
||||
return mNextRecord;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through SAMRecords matching the target interval.
|
||||
* @param sequence Desired reference sequence.
|
||||
* @param start 1-based start of target interval, inclusive.
|
||||
* @param end 1-based end of target interval, inclusive.
|
||||
* @param queryType contained, overlapping, or starting-at query.
|
||||
*/
|
||||
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
|
||||
final int start,
|
||||
final int end,
|
||||
final QueryType queryType) {
|
||||
long[] filePointers = null;
|
||||
|
||||
// Hit the index to determine the chunk boundaries for the required data.
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
final int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
if (referenceIndex != -1) {
|
||||
final BAMIndex fileIndex = getIndex();
|
||||
final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
|
||||
filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
|
||||
}
|
||||
|
||||
// Create an iterator over the above chunk boundaries.
|
||||
final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
|
||||
|
||||
// Add some preprocessing filters for edge-case reads that don't fit into this
|
||||
// query type.
|
||||
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
|
||||
}
|
||||
|
||||
enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
|
||||
|
||||
/**
|
||||
* Look for BAM index file according to standard naming convention.
|
||||
*
|
||||
* @param dataFile BAM file name.
|
||||
* @return Index file name, or null if not found.
|
||||
*/
|
||||
private static File findIndexFile(final File dataFile) {
|
||||
// If input is foo.bam, look for foo.bai
|
||||
final String bamExtension = ".bam";
|
||||
File indexFile;
|
||||
final String fileName = dataFile.getName();
|
||||
if (fileName.endsWith(bamExtension)) {
|
||||
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
|
||||
indexFile = new File(dataFile.getParent(), bai);
|
||||
if (indexFile.exists()) {
|
||||
return indexFile;
|
||||
}
|
||||
}
|
||||
|
||||
// If foo.bai doesn't exist look for foo.bam.bai
|
||||
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
|
||||
if (indexFile.exists()) {
|
||||
return indexFile;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
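// Worked example (not in the original source): for /data/sample.bam this method first probes
// /data/sample.bai and, if that is absent, /data/sample.bam.bai, returning null when neither
// exists.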
|
||||
|
||||
private class BAMFileIndexIterator extends BAMFileIterator {
|
||||
|
||||
private long[] mFilePointers = null;
|
||||
private int mFilePointerIndex = 0;
|
||||
private long mFilePointerLimit = -1;
|
||||
|
||||
/**
|
||||
* Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
|
||||
* @param filePointers the block / offset combination, stored in chunk format.
|
||||
*/
|
||||
BAMFileIndexIterator(final long[] filePointers) {
|
||||
super(false); // delay advance() until after construction
|
||||
mFilePointers = filePointers;
|
||||
advance();
|
||||
}
|
||||
|
||||
SAMRecord getNextRecord()
|
||||
throws IOException {
|
||||
// Advance to next file block if necessary
|
||||
while (mInputStream.getFilePointer() >= mFilePointerLimit) {
|
||||
if (mFilePointers == null ||
|
||||
mFilePointerIndex >= mFilePointers.length) {
|
||||
return null;
|
||||
}
|
||||
final long startOffset = mFilePointers[mFilePointerIndex++];
|
||||
final long endOffset = mFilePointers[mFilePointerIndex++];
|
||||
mInputStream.seek(startOffset);
|
||||
mFilePointerLimit = endOffset;
|
||||
}
|
||||
// Pull next record from stream
|
||||
return super.getNextRecord();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A decorating iterator that filters out records that are outside the bounds of the
|
||||
* given query parameters.
|
||||
*/
|
||||
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
/**
|
||||
* The wrapped iterator.
|
||||
*/
|
||||
private final CloseableIterator<SAMRecord> wrappedIterator;
|
||||
|
||||
/**
|
||||
* The next record to be returned. Will be null if no such record exists.
|
||||
*/
|
||||
private SAMRecord mNextRecord;
|
||||
|
||||
private final int mReferenceIndex;
|
||||
private final int mRegionStart;
|
||||
private final int mRegionEnd;
|
||||
private final QueryType mQueryType;
|
||||
|
||||
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
|
||||
this.wrappedIterator = iterator;
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
mRegionStart = start;
|
||||
if (queryType == QueryType.STARTING_AT) {
|
||||
mRegionEnd = mRegionStart;
|
||||
} else {
|
||||
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
|
||||
}
|
||||
mQueryType = queryType;
|
||||
mNextRecord = advance();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if a next element exists; false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return mNextRecord != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next record from the given iterator.
|
||||
* @return The next SAM record in the iterator.
|
||||
*/
|
||||
public SAMRecord next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
|
||||
final SAMRecord currentRead = mNextRecord;
|
||||
mNextRecord = advance();
|
||||
return currentRead;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes down the existing iterator.
|
||||
*/
|
||||
public void close() {
|
||||
if (this != mCurrentIterator) {
|
||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||
}
|
||||
mCurrentIterator = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
SAMRecord advance() {
|
||||
while (true) {
|
||||
// Pull next record from stream
|
||||
if(!wrappedIterator.hasNext())
|
||||
return null;
|
||||
|
||||
final SAMRecord record = wrappedIterator.next();
|
||||
// If beyond the end of this reference sequence, end iteration
|
||||
final int referenceIndex = record.getReferenceIndex();
|
||||
if (referenceIndex != mReferenceIndex) {
|
||||
if (referenceIndex < 0 ||
|
||||
referenceIndex > mReferenceIndex) {
|
||||
return null;
|
||||
}
|
||||
// If before this reference sequence, continue
|
||||
continue;
|
||||
}
|
||||
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
|
||||
// Quick exit to avoid expensive alignment end calculation
|
||||
return record;
|
||||
}
|
||||
final int alignmentStart = record.getAlignmentStart();
|
||||
// If read is unmapped but has a coordinate, return it if the coordinate is within
|
||||
// the query region, regardless of whether the mapped mate will be returned.
|
||||
final int alignmentEnd;
|
||||
if (mQueryType == QueryType.STARTING_AT) {
|
||||
alignmentEnd = -1;
|
||||
} else {
|
||||
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
|
||||
record.getAlignmentEnd(): alignmentStart);
|
||||
}
|
||||
|
||||
if (alignmentStart > mRegionEnd) {
|
||||
// If scanned beyond target region, end iteration
|
||||
return null;
|
||||
}
|
||||
// Filter for overlap with region
|
||||
if (mQueryType == QueryType.CONTAINED) {
|
||||
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
|
||||
return record;
|
||||
}
|
||||
} else if (mQueryType == QueryType.OVERLAPPING) {
|
||||
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
|
||||
return record;
|
||||
}
|
||||
} else {
|
||||
if (alignmentStart == mRegionStart) {
|
||||
return record;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
|
||||
private BAMFileIndexUnmappedIterator() {
|
||||
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
advance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -40,6 +40,10 @@ public class GATKChunk extends Chunk {
|
|||
super(start,stop);
|
||||
}
|
||||
|
||||
public GATKChunk(final long blockStart, final int blockOffsetStart, final long blockEnd, final int blockOffsetEnd) {
|
||||
super(blockStart << 16 | blockOffsetStart,blockEnd << 16 | blockOffsetEnd);
|
||||
}
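// Illustrative sketch (not in the original source): the two long arguments of the super call are
// BGZF virtual file pointers, i.e. (compressed block start << 16) | offset within the
// uncompressed block. For example, blockStart = 0x1000 and blockOffsetStart = 5 pack to
// 0x10000005L.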
|
||||
|
||||
public GATKChunk(final Chunk chunk) {
|
||||
super(chunk.getChunkStart(),chunk.getChunkEnd());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -22,15 +22,18 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.queue.extensions.gatk
|
||||
|
||||
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
package net.sf.samtools;
|
||||
|
||||
/**
|
||||
* A no-op for index files that were automatically generated during the gather step.
|
||||
* TODO: Allow graph to know that this isn't needed, and/or that one gather job can actually gather N-outputs, and/or look more into generic source->sinks.
|
||||
* Utils that insist on being in the same package as Picard.
|
||||
*/
|
||||
class AutoIndexGatherFunction extends InProcessFunction with GatherFunction {
|
||||
def run() {}
|
||||
public class PicardNamespaceUtils {
|
||||
/**
|
||||
* Private constructor only. Do not instantiate.
|
||||
*/
|
||||
private PicardNamespaceUtils() {}
|
||||
|
||||
public static void setFileSource(final SAMRecord read, final SAMFileSource fileSource) {
|
||||
read.setFileSource(fileSource);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,72 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package net.sf.samtools.util;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* An input stream formulated for use reading BAM files. Supports seeking via virtual file pointers.
|
||||
*/
|
||||
public interface BAMInputStream {
|
||||
/**
|
||||
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
|
||||
* not an actual byte offset.
|
||||
*
|
||||
* @param pos virtual file pointer
|
||||
*/
|
||||
public void seek(final long pos) throws IOException;
|
||||
|
||||
/**
|
||||
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
|
||||
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
|
||||
* the two.
|
||||
*/
|
||||
public long getFilePointer();
|
||||
|
||||
/**
|
||||
* Determines whether or not the inflater will re-calculate the CRC on the decompressed data
|
||||
* and check it against the value stored in the GZIP header. CRC checking is an expensive
|
||||
* operation and should be used accordingly.
|
||||
*/
|
||||
public void setCheckCrcs(final boolean check);
|
||||
|
||||
public int read() throws java.io.IOException;
|
||||
|
||||
public int read(byte[] bytes) throws java.io.IOException;
|
||||
|
||||
public int read(byte[] bytes, int i, int i1) throws java.io.IOException;
|
||||
|
||||
public long skip(long l) throws java.io.IOException;
|
||||
|
||||
public int available() throws java.io.IOException;
|
||||
|
||||
public void close() throws java.io.IOException;
|
||||
|
||||
public void mark(int i);
|
||||
|
||||
public void reset() throws java.io.IOException;
|
||||
|
||||
public boolean markSupported();
|
||||
}
|
||||
|
|
@ -1,483 +0,0 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools.util;
|
||||
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.net.URL;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
|
||||
import net.sf.samtools.FileTruncatedException;
|
||||
|
||||
/*
|
||||
* Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
|
||||
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
|
||||
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
|
||||
* entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
|
||||
*
|
||||
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
|
||||
*/
|
||||
public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
|
||||
private InputStream mStream = null;
|
||||
private SeekableStream mFile = null;
|
||||
private byte[] mFileBuffer = null;
|
||||
private byte[] mCurrentBlock = null;
|
||||
private int mCurrentOffset = 0;
|
||||
private long mBlockAddress = 0;
|
||||
private int mLastBlockLength = 0;
|
||||
private final BlockGunzipper blockGunzipper = new BlockGunzipper();
|
||||
|
||||
|
||||
/**
|
||||
* Note that seek() is not supported if this ctor is used.
|
||||
*/
|
||||
public BlockCompressedInputStream(final InputStream stream) {
|
||||
mStream = IOUtil.toBufferedStream(stream);
|
||||
mFile = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use this ctor if you wish to call seek()
|
||||
*/
|
||||
public BlockCompressedInputStream(final File file)
|
||||
throws IOException {
|
||||
mFile = new SeekableFileStream(file);
|
||||
mStream = null;
|
||||
|
||||
}
|
||||
|
||||
public BlockCompressedInputStream(final URL url) {
|
||||
mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
|
||||
mStream = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* For providing some arbitrary data source. No additional buffering is
|
||||
* provided, so if the underlying source is not buffered, wrap it in a
|
||||
* SeekableBufferedStream before passing to this ctor.
|
||||
*/
|
||||
public BlockCompressedInputStream(final SeekableStream strm) {
|
||||
mFile = strm;
|
||||
mStream = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether or not the inflater will re-calculate the CRC on the decompressed data
|
||||
* and check it against the value stored in the GZIP header. CRC checking is an expensive
|
||||
* operation and should be used accordingly.
|
||||
*/
|
||||
public void setCheckCrcs(final boolean check) {
|
||||
this.blockGunzipper.setCheckCrcs(check);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
|
||||
* next caller of a method for this input stream. The next caller might be the same thread or another thread.
|
||||
* Note that although the next caller can read this many bytes without blocking, the available() method call itself
|
||||
* may block in order to fill an internal buffer if it has been exhausted.
|
||||
*/
|
||||
public int available()
|
||||
throws IOException {
|
||||
if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
|
||||
readBlock();
|
||||
}
|
||||
if (mCurrentBlock == null) {
|
||||
return 0;
|
||||
}
|
||||
return mCurrentBlock.length - mCurrentOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the underlying InputStream or RandomAccessFile
|
||||
*/
|
||||
public void close()
|
||||
throws IOException {
|
||||
if (mFile != null) {
|
||||
mFile.close();
|
||||
mFile = null;
|
||||
} else if (mStream != null) {
|
||||
mStream.close();
|
||||
mStream = null;
|
||||
}
|
||||
// Encourage garbage collection
|
||||
mFileBuffer = null;
|
||||
mCurrentBlock = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
|
||||
* If no byte is available because the end of the stream has been reached, the value -1 is returned.
|
||||
* This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
|
||||
|
||||
* @return the next byte of data, or -1 if the end of the stream is reached.
|
||||
*/
|
||||
public int read()
|
||||
throws IOException {
|
||||
return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
|
||||
* actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
|
||||
* or an exception is thrown.
|
||||
*
|
||||
* read(buf) has the same effect as read(buf, 0, buf.length).
|
||||
*
|
||||
* @param buffer the buffer into which the data is read.
|
||||
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
|
||||
* the stream has been reached.
|
||||
*/
|
||||
public int read(final byte[] buffer)
|
||||
throws IOException {
|
||||
return read(buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
private volatile ByteArrayOutputStream buf = null;
|
||||
private static final byte eol = '\n';
|
||||
private static final byte eolCr = '\r';
|
||||
|
||||
/**
|
||||
* Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
|
||||
* carriage return ('\r') or carriage return followed by a line feed ("\r\n").
|
||||
*
|
||||
* @return A String containing the contents of the line, excluding the line terminating
|
||||
* character, or null if the end of the stream has been reached
|
||||
*
|
||||
* @exception IOException If an I/O error occurs
|
||||
|
||||
*/
|
||||
public String readLine() throws IOException {
|
||||
int available = available();
|
||||
if (available == 0) {
|
||||
return null;
|
||||
}
|
||||
if(null == buf){ // lazy initialisation
|
||||
buf = new ByteArrayOutputStream(8192);
|
||||
}
|
||||
buf.reset();
|
||||
boolean done = false;
|
||||
boolean foundCr = false; // \r found flag
|
||||
while (!done) {
|
||||
int linetmpPos = mCurrentOffset;
|
||||
int bCnt = 0;
|
||||
while((available-- > 0)){
|
||||
final byte c = mCurrentBlock[linetmpPos++];
|
||||
if(c == eol){ // found \n
|
||||
done = true;
|
||||
break;
|
||||
} else if(foundCr){ // previous char was \r
|
||||
--linetmpPos; // current char is not \n so put it back
|
||||
done = true;
|
||||
break;
|
||||
} else if(c == eolCr){ // found \r
|
||||
foundCr = true;
|
||||
continue; // no ++bCnt
|
||||
}
|
||||
++bCnt;
|
||||
}
|
||||
if(mCurrentOffset < linetmpPos){
|
||||
buf.write(mCurrentBlock, mCurrentOffset, bCnt);
|
||||
mCurrentOffset = linetmpPos;
|
||||
}
|
||||
available = available();
|
||||
if(available == 0){
|
||||
// EOF
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
|
||||
* as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
|
||||
*
|
||||
* This method blocks until input data is available, end of file is detected, or an exception is thrown.
|
||||
*
|
||||
* @param buffer buffer into which data is read.
|
||||
* @param offset the start offset in array b at which the data is written.
|
||||
* @param length the maximum number of bytes to read.
|
||||
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
|
||||
* the stream has been reached.
|
||||
*/
|
||||
public int read(final byte[] buffer, int offset, int length)
|
||||
throws IOException {
|
||||
final int originalLength = length;
|
||||
while (length > 0) {
|
||||
final int available = available();
|
||||
if (available == 0) {
|
||||
// Signal EOF to caller
|
||||
if (originalLength == length) {
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
final int copyLength = Math.min(length, available);
|
||||
System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
|
||||
mCurrentOffset += copyLength;
|
||||
offset += copyLength;
|
||||
length -= copyLength;
|
||||
}
|
||||
return originalLength - length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
|
||||
* not an actual byte offset.
|
||||
*
|
||||
* @param pos virtual file pointer
|
||||
*/
|
||||
public void seek(final long pos)
|
||||
throws IOException {
|
||||
if (mFile == null) {
|
||||
throw new IOException("Cannot seek on stream based file");
|
||||
}
|
||||
// Decode virtual file pointer
|
||||
// Upper 48 bits is the byte offset into the compressed stream of a block.
|
||||
// Lower 16 bits is the byte offset into the uncompressed stream inside the block.
|
||||
final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
|
||||
final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
|
||||
final int available;
|
||||
if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
|
||||
available = mCurrentBlock.length;
|
||||
} else {
|
||||
mFile.seek(compressedOffset);
|
||||
mBlockAddress = compressedOffset;
|
||||
mLastBlockLength = 0;
|
||||
readBlock();
|
||||
available = available();
|
||||
}
|
||||
if (uncompressedOffset > available ||
|
||||
(uncompressedOffset == available && !eof())) {
|
||||
throw new IOException("Invalid file pointer: " + pos);
|
||||
}
|
||||
mCurrentOffset = uncompressedOffset;
|
||||
}
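// Illustrative sketch (not in the original source): a virtual file pointer of 0x10000005L decodes
// to compressedOffset = 0x1000 (upper 48 bits) and uncompressedOffset = 5 (lower 16 bits), so
// seek() positions the underlying file at byte 0x1000, inflates that block, and then points
// 5 bytes into the inflated data.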
|
||||
|
||||
private boolean eof() throws IOException {
|
||||
if (mFile.eof()) {
|
||||
return true;
|
||||
}
|
||||
// If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
|
||||
return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
|
||||
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
|
||||
* the two.
|
||||
*/
|
||||
public long getFilePointer() {
|
||||
if (mCurrentOffset == mCurrentBlock.length) {
|
||||
// If current offset is at the end of the current block, file pointer should point
|
||||
// to the beginning of the next block.
|
||||
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
|
||||
}
|
||||
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
|
||||
}
|
||||
|
||||
public static long getFileBlock(final long bgzfOffset) {
|
||||
return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
|
||||
* @return true if the given file looks like a valid BGZF file.
|
||||
*/
|
||||
public static boolean isValidFile(final InputStream stream)
|
||||
throws IOException {
|
||||
if (!stream.markSupported()) {
|
||||
throw new RuntimeException("Cannot test non-buffered stream");
|
||||
}
|
||||
stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
|
||||
final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
stream.reset();
|
||||
return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
|
||||
}
|
||||
|
||||
private static boolean isValidBlockHeader(final byte[] buffer) {
|
||||
return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
|
||||
(buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
|
||||
(buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
|
||||
buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
|
||||
buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
|
||||
buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
|
||||
}
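// Illustrative sketch (not in the original source): a valid BGZF block begins with the gzip magic
// 1f 8b, sets the FEXTRA flag in byte 3, declares XLEN = 6 at byte 10, and carries the "BC"
// extra-subfield identifier at bytes 12-13, which is what the checks above test for.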

    private void readBlock()
        throws IOException {

        if (mFileBuffer == null) {
            mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
        }
        int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        if (count == 0) {
            // Handle case where there is no empty gzip block at end.
            mCurrentOffset = 0;
            mBlockAddress += mLastBlockLength;
            mCurrentBlock = new byte[0];
            return;
        }
        if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
            throw new IOException("Premature end of file");
        }
        final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
        if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
            throw new IOException("Unexpected compressed block length: " + blockLength);
        }
        final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
        count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
        if (count != remaining) {
            throw new FileTruncatedException("Premature end of file");
        }
        inflateBlock(mFileBuffer, blockLength);
        mCurrentOffset = 0;
        mBlockAddress += mLastBlockLength;
        mLastBlockLength = blockLength;
    }
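
The "+ 1" above reflects the BGZF convention that the header stores the total block size minus one as a little-endian uint16. A small standalone sketch of decoding that field, assuming the BAM-spec layout in which it sits at byte offset 16 of the block header:

public class BsizeDemo {
    // Offset of the BSIZE-1 field within a BGZF block header (assumed, per the BAM spec's fixed layout).
    private static final int BLOCK_LENGTH_OFFSET = 16;

    static int compressedBlockLength(final byte[] blockHeader) {
        // Little-endian uint16 holding (total block size - 1).
        final int bsizeMinusOne = (blockHeader[BLOCK_LENGTH_OFFSET] & 0xFF) |
                                  ((blockHeader[BLOCK_LENGTH_OFFSET + 1] & 0xFF) << 8);
        return bsizeMinusOne + 1;
    }

    public static void main(final String[] args) {
        final byte[] header = new byte[18];
        header[BLOCK_LENGTH_OFFSET] = (byte) 0xFF;     // low byte of 0x00FF
        header[BLOCK_LENGTH_OFFSET + 1] = (byte) 0x00; // high byte
        System.out.println(compressedBlockLength(header)); // prints 256
    }
}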

    private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
        throws IOException {
        final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
        byte[] buffer = mCurrentBlock;
        mCurrentBlock = null;
        if (buffer == null || buffer.length != uncompressedLength) {
            try {
                buffer = new byte[uncompressedLength];
            } catch (NegativeArraySizeException e) {
                throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
            }
        }
        blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
        mCurrentBlock = buffer;
    }

    private int readBytes(final byte[] buffer, final int offset, final int length)
        throws IOException {
        if (mFile != null) {
            return readBytes(mFile, buffer, offset, length);
        } else if (mStream != null) {
            return readBytes(mStream, buffer, offset, length);
        } else {
            return 0;
        }
    }

    private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
        throws IOException {
        int bytesRead = 0;
        while (bytesRead < length) {
            final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
            if (count <= 0) {
                break;
            }
            bytesRead += count;
        }
        return bytesRead;
    }

    private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
        throws IOException {
        int bytesRead = 0;
        while (bytesRead < length) {
            final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
            if (count <= 0) {
                break;
            }
            bytesRead += count;
        }
        return bytesRead;
    }

    private int unpackInt16(final byte[] buffer, final int offset) {
        return ((buffer[offset] & 0xFF) |
                ((buffer[offset+1] & 0xFF) << 8));
    }

    private int unpackInt32(final byte[] buffer, final int offset) {
        return ((buffer[offset] & 0xFF) |
                ((buffer[offset+1] & 0xFF) << 8) |
                ((buffer[offset+2] & 0xFF) << 16) |
                ((buffer[offset+3] & 0xFF) << 24));
    }

    public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}

    public static FileTermination checkTermination(final File file)
        throws IOException {
        final long fileSize = file.length();
        if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
            return FileTermination.DEFECTIVE;
        }
        final RandomAccessFile raFile = new RandomAccessFile(file, "r");
        try {
            raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
            byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
            raFile.readFully(buf);
            if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
                return FileTermination.HAS_TERMINATOR_BLOCK;
            }
            final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
            buf = new byte[bufsize];
            raFile.seek(fileSize - bufsize);
            raFile.read(buf);
            for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
                 i >= 0; --i) {
                if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
                                   buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
                    continue;
                }
                final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
                byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
                final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
                if (buf.length - i == totalBlockSizeMinusOne + 1) {
                    return FileTermination.HAS_HEALTHY_LAST_BLOCK;
                } else {
                    return FileTermination.DEFECTIVE;
                }
            }
            return FileTermination.DEFECTIVE;
        } finally {
            raFile.close();
        }
    }

    private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
        for (int i = 0; i < length; ++i) {
            if (preamble[i] != buf[i + startOffset]) {
                return false;
            }
        }
        return true;
    }
}

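A sketch of how checkTermination() might be used to sanity-check a BAM before trusting it; the import again assumes the Picard-style package for this class:

import net.sf.samtools.util.BlockCompressedInputStream; // assumed package for the class in this diff

import java.io.File;
import java.io.IOException;

public class BamTerminatorCheck {
    public static void main(final String[] args) throws IOException {
        final File bam = new File(args[0]);
        // DEFECTIVE usually means the writer died before flushing its final block.
        switch (BlockCompressedInputStream.checkTermination(bam)) {
            case HAS_TERMINATOR_BLOCK:   System.out.println("File ends with the empty BGZF terminator block."); break;
            case HAS_HEALTHY_LAST_BLOCK: System.out.println("No terminator, but the last block is intact."); break;
            case DEFECTIVE:              System.out.println("File appears truncated or corrupt."); break;
        }
    }
}
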
@@ -41,4 +41,14 @@ public class BWAConfiguration {
     * What is the scoring penalty for a gap extension?
     */
    public Integer gapExtensionPenalty = null;

    /**
     * Enter bwa's 'non-stop' mode (equivalent to bwa aln -N parameter).
     */
    public Boolean nonStopMode = false;

    /**
     * Set the max queue size that bwa will use when searching for matches (equivalent to bwa aln -m parameter).
     */
    public Integer maxEntriesInQueue = null;
}

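A hedged sketch of how these pass-through options might be populated before handing the configuration to a BWA aligner wrapper; the no-arg constructor and the final wrapper call are assumptions, not part of this diff:

public class BwaConfigDemo {
    public static void main(final String[] args) {
        final BWAConfiguration configuration = new BWAConfiguration(); // assumes the default constructor
        configuration.gapExtensionPenalty = 4;      // gap extension penalty
        configuration.nonStopMode = true;           // bwa aln -N
        configuration.maxEntriesInQueue = 2000000;  // bwa aln -m
        // An aligner wrapper would then be constructed or updated with this configuration,
        // e.g. aligner.updateConfiguration(configuration) -- hypothetical call.
    }
}
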
@ -139,11 +139,11 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
*/
|
||||
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
|
||||
private int MAX_HISTOGRAM_VALUE = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
|
||||
private boolean DO_INDEL_QUALITY = false;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -274,7 +274,6 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN );
|
||||
|
||||
}
|
||||
|
||||
private void writeDataTables() {
|
||||
|
|
@ -341,7 +340,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
|
||||
// for each covariate
|
||||
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
|
||||
Covariate cov = requestedCovariates.get(iii);
|
||||
final Covariate cov = requestedCovariates.get(iii);
|
||||
final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
|
||||
if (DO_INDEL_QUALITY) {
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
|
|
@ -349,7 +348,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
// The second argument is the name of the covariate in order to make the plots look nice
|
||||
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
|
||||
executor.exec();
|
||||
} else {
|
||||
} else {
|
||||
if( iii == 1 ) {
|
||||
// Analyze reported quality
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
|
|
|
|||
|
|
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2010, The Broad Institute
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation

@@ -34,5 +34,6 @@ import java.lang.annotation.*;
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface Gather {
    Class value();
    Class value() default Gather.class;
    boolean enabled() default true;
}

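With value() now defaulting to Gather.class, the annotation can be applied bare just to toggle gathering, or pointed at an explicit gatherer. A hedged usage sketch; the output fields and gatherer class are made up for illustration, and the Gather import is assumed to be on the classpath:

import java.io.File;

public class GatherUsageDemo {
    // Hypothetical gatherer; a real one would extend the pipeline's gatherer base class.
    public static class MyVcfGatherer { }

    // Explicit gatherer for this output.
    @Gather(MyVcfGatherer.class)
    public File mergedVcf;

    // Gathering switched off; value() now falls back to its Gather.class default.
    @Gather(enabled = false)
    public File perShardLog;
}
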
@ -35,9 +35,12 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor;
|
|||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.classloader.JVMUtils;
|
||||
import org.broadinstitute.sting.utils.crypt.CryptUtils;
|
||||
import org.broadinstitute.sting.utils.crypt.GATKKey;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.ListFileUtils;
|
||||
|
||||
import java.security.PublicKey;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -78,6 +81,9 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
|
|||
Walker<?,?> walker = engine.getWalkerByName(getAnalysisName());
|
||||
|
||||
try {
|
||||
// Make sure a valid GATK user key is present, if required.
|
||||
authorizeGATKRun();
|
||||
|
||||
engine.setArguments(getArgumentCollection());
|
||||
|
||||
// File lists can require a bit of additional expansion. Set these explicitly by the engine.
|
||||
|
|
@ -130,6 +136,28 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Authorizes this run of the GATK by checking for a valid GATK user key, if required.
|
||||
* Currently, a key is required only if running with the -et NO_ET or -et STDOUT options.
|
||||
*/
|
||||
private void authorizeGATKRun() {
|
||||
if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET ||
|
||||
getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) {
|
||||
if ( getArgumentCollection().gatkKeyFile == null ) {
|
||||
throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " +
|
||||
"Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " +
|
||||
"for more information and instructions on how to obtain a key.");
|
||||
}
|
||||
else {
|
||||
PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey();
|
||||
GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile);
|
||||
|
||||
if ( ! gatkUserKey.isValid() ) {
|
||||
throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled.
|
||||
|
|
|
|||
|
|
@ -25,6 +25,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.samtools.SAMException;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
|
|
@ -95,7 +97,11 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
// We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are
|
||||
// lazy loaded, so they aren't caught elsewhere and made into User Exceptions
|
||||
exitSystemWithUserError(e);
|
||||
} catch (net.sf.samtools.SAMException e) {
|
||||
} catch(PicardException e) {
|
||||
// TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions?
|
||||
exitSystemWithError(e);
|
||||
}
|
||||
catch (SAMException e) {
|
||||
checkForTooManyOpenFilesProblem(e.getMessage());
|
||||
exitSystemWithSamError(e);
|
||||
} catch (Throwable t) {
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
|
@ -179,10 +180,18 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private static final long GATK_RANDOM_SEED = 47382911L;
|
||||
private static Random randomGenerator = new Random(GATK_RANDOM_SEED);
|
||||
|
||||
public static Random getRandomGenerator() { return randomGenerator; }
|
||||
public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); }
|
||||
public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); }
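
A single static Random seeded with a fixed constant, plus an explicit reset, is what makes repeated runs reproducible. A minimal illustration of the idea, using the same constant but otherwise independent of the engine:

import java.util.Random;

public class SeedResetDemo {
    private static final long FIXED_SEED = 47382911L;
    private static final Random RANDOM = new Random(FIXED_SEED);

    public static void main(final String[] args) {
        // First "run".
        final int first = RANDOM.nextInt(100);
        // Resetting the seed makes the next draw identical to the first run's draw.
        RANDOM.setSeed(FIXED_SEED);
        final int second = RANDOM.nextInt(100);
        System.out.println(first == second); // prints true
    }
}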
|
||||
|
||||
/**
|
||||
* Base Quality Score Recalibration helper object
|
||||
*/
|
||||
private BaseRecalibration baseRecalibration = null;
|
||||
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
|
||||
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
|
||||
public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
|
||||
|
||||
/**
|
||||
* Actually run the GATK with the specified walker.
|
||||
*
|
||||
|
|
@ -205,6 +214,10 @@ public class GenomeAnalysisEngine {
|
|||
if (this.getArguments().nonDeterministicRandomSeed)
|
||||
resetRandomGenerator(System.currentTimeMillis());
|
||||
|
||||
// if the user specified an input BQSR recalibration table then enable on-the-fly recalibration
|
||||
if (this.getArguments().BQSR_RECAL_FILE != null)
|
||||
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
|
||||
|
||||
// Determine how the threads should be divided between CPU vs. IO.
|
||||
determineThreadAllocation();
|
||||
|
||||
|
|
@ -224,7 +237,7 @@ public class GenomeAnalysisEngine {
|
|||
// create temp directories as necessary
|
||||
initializeTempDirectory();
|
||||
|
||||
// create the output streams "
|
||||
// create the output streams
|
||||
initializeOutputStreams(microScheduler.getOutputTracker());
|
||||
|
||||
Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
||||
|
|
@ -450,7 +463,15 @@ public class GenomeAnalysisEngine {
|
|||
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
|
||||
}
|
||||
}
|
||||
else if(walker instanceof ActiveRegionWalker) {
|
||||
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
||||
if(intervals == null)
|
||||
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer());
|
||||
}
|
||||
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
|
||||
// Apply special validation to read pair walkers.
|
||||
if(walker instanceof ReadPairWalker) {
|
||||
|
|
@ -749,6 +770,7 @@ public class GenomeAnalysisEngine {
|
|||
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
|
||||
getWalkerBAQQualityMode(),
|
||||
refReader,
|
||||
getBaseRecalibration(),
|
||||
argCollection.defaultBaseQualities);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
|
||||
import java.util.Collection;
|
||||
/**
|
||||
|
|
@ -27,23 +28,20 @@ import java.util.Collection;
|
|||
* information about how they should be downsampled, sorted, and filtered.
|
||||
*/
|
||||
public class ReadProperties {
|
||||
private Collection<SAMReaderID> readers = null;
|
||||
private SAMFileHeader header = null;
|
||||
private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT;
|
||||
private DownsamplingMethod downsamplingMethod = null;
|
||||
private ValidationExclusion exclusionList = null;
|
||||
private Collection<ReadFilter> supplementalFilters = null;
|
||||
private boolean includeReadsWithDeletionAtLoci = false;
|
||||
private boolean useOriginalBaseQualities = false;
|
||||
private boolean generateExtendedEvents = false;
|
||||
private BAQ.CalculationMode cmode = BAQ.CalculationMode.OFF;
|
||||
private BAQ.QualityMode qmode = BAQ.QualityMode.DONT_MODIFY;
|
||||
IndexedFastaSequenceFile refReader = null; // read for BAQ, if desired
|
||||
private byte defaultBaseQualities;
|
||||
|
||||
// do we want to generate additional piles of "extended" events (indels)
|
||||
// immediately after the reference base such event is associated with?
|
||||
|
||||
private final Collection<SAMReaderID> readers;
|
||||
private final SAMFileHeader header;
|
||||
private final SAMFileReader.ValidationStringency validationStringency;
|
||||
private final DownsamplingMethod downsamplingMethod;
|
||||
private final ValidationExclusion exclusionList;
|
||||
private final Collection<ReadFilter> supplementalFilters;
|
||||
private final boolean includeReadsWithDeletionAtLoci;
|
||||
private final boolean useOriginalBaseQualities;
|
||||
private final boolean generateExtendedEvents;
|
||||
private final BAQ.CalculationMode cmode;
|
||||
private final BAQ.QualityMode qmode;
|
||||
private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
|
||||
private final BaseRecalibration bqsrApplier;
|
||||
private final byte defaultBaseQualities;
|
||||
|
||||
/**
|
||||
* Return true if the walker wants to see reads that contain deletions when looking at locus pileups
|
||||
|
|
@ -126,6 +124,8 @@ public class ReadProperties {
|
|||
return refReader;
|
||||
}
|
||||
|
||||
public BaseRecalibration getBQSRApplier() { return bqsrApplier; }
|
||||
|
||||
/**
|
||||
* @return Default base quality value to fill reads missing base quality information.
|
||||
*/
|
||||
|
|
@ -165,8 +165,9 @@ public class ReadProperties {
|
|||
boolean includeReadsWithDeletionAtLoci,
|
||||
boolean generateExtendedEvents,
|
||||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
BaseRecalibration bqsrApplier,
|
||||
byte defaultBaseQualities) {
|
||||
this.readers = samFiles;
|
||||
this.header = header;
|
||||
|
|
@ -180,6 +181,7 @@ public class ReadProperties {
|
|||
this.cmode = cmode;
|
||||
this.qmode = qmode;
|
||||
this.refReader = refReader;
|
||||
this.bqsrApplier = bqsrApplier;
|
||||
this.defaultBaseQualities = defaultBaseQualities;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,9 +65,12 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
|
||||
public Integer readBufferSize = null;
|
||||
|
||||
@Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? Standard is the default, can be verbose or NO_ET so nothing is posted to the run repository", required = false)
|
||||
@Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
|
||||
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
|
||||
|
||||
@Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
|
||||
public File gatkKeyFile = null;
|
||||
|
||||
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
|
||||
public List<String> readFilters = new ArrayList<String>();
|
||||
|
||||
|
|
@ -75,6 +78,7 @@ public class GATKArgumentCollection {
|
|||
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
|
||||
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
|
||||
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
|
||||
* To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
|
||||
*/
|
||||
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
|
||||
public List<IntervalBinding<Feature>> intervals = null;
|
||||
|
|
@ -185,6 +189,15 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
|
||||
public Boolean useOriginalBaseQualities = false;
|
||||
|
||||
/**
|
||||
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
|
||||
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
|
||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
|
||||
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
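
For orientation, one hypothetical data record from such a recalibration table might look like the line below, where the leading fields are covariate values (say read group, reported quality, cycle, and dinucleotide context) and the last three are the observation count, mismatch count, and phred-scaled empirical quality; the exact columns depend on which covariates were requested at runtime:

SRR000001,30,5,AC,104235,52,33.02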
|
||||
|
||||
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
|
||||
public byte defaultBaseQualities = -1;
|
||||
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import java.util.NoSuchElementException;
|
|||
*/
|
||||
|
||||
/**
|
||||
* A LocusView over which the user can iterate.
|
||||
* A LocusView over which the user can iterate.
|
||||
*/
|
||||
|
||||
public class AllLocusView extends LocusView {
|
||||
|
|
@ -47,12 +47,13 @@ public class AllLocusView extends LocusView {
|
|||
|
||||
/**
|
||||
* Create a new queue of locus contexts.
|
||||
*
|
||||
* @param provider
|
||||
*/
|
||||
public AllLocusView(LocusShardDataProvider provider) {
|
||||
super( provider );
|
||||
public AllLocusView(LocusShardDataProvider provider) {
|
||||
super(provider);
|
||||
// Seed the state tracking members with the first possible seek position and the first possible locus context.
|
||||
locusIterator = new GenomeLocusIterator(genomeLocParser,provider.getLocus());
|
||||
locusIterator = new GenomeLocusIterator(genomeLocParser, provider.getLocus());
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
|
|
@ -63,7 +64,7 @@ public class AllLocusView extends LocusView {
|
|||
public AlignmentContext next() {
|
||||
advance();
|
||||
|
||||
if(nextPosition == null)
|
||||
if (nextPosition == null)
|
||||
throw new NoSuchElementException("No next is available in the all locus view");
|
||||
|
||||
// Flag to the iterator that no data is waiting in the queue to be processed.
|
||||
|
|
@ -72,7 +73,7 @@ public class AllLocusView extends LocusView {
|
|||
AlignmentContext currentLocus;
|
||||
|
||||
// If actual data is present, return it. Otherwise, return empty data.
|
||||
if( nextLocus != null && nextLocus.getLocation().equals(nextPosition) )
|
||||
if (nextLocus != null && nextLocus.getLocation().equals(nextPosition))
|
||||
currentLocus = nextLocus;
|
||||
else
|
||||
currentLocus = createEmptyLocus(nextPosition);
|
||||
|
|
@ -82,15 +83,15 @@ public class AllLocusView extends LocusView {
|
|||
|
||||
private void advance() {
|
||||
// Already at the next element? Don't move forward.
|
||||
if(atNextElement)
|
||||
if (atNextElement)
|
||||
return;
|
||||
|
||||
// Out of elements?
|
||||
if(nextPosition == null && !locusIterator.hasNext())
|
||||
return;
|
||||
if (nextPosition == null && !locusIterator.hasNext())
|
||||
return;
|
||||
|
||||
// If nextLocus has been consumed, clear it out to make room for the next incoming locus.
|
||||
if(nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) {
|
||||
if (nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) {
|
||||
nextLocus = null;
|
||||
|
||||
// Determine the next locus. The trick is that we may have more than one alignment context at the same
|
||||
|
|
@ -98,9 +99,9 @@ public class AllLocusView extends LocusView {
|
|||
// is still at the current position, we do not increment current position and wait for next call to next() to return
|
||||
// that context. If we know that next context is past the current position, we are done with current
|
||||
// position
|
||||
if(hasNextLocus()) {
|
||||
if (hasNextLocus()) {
|
||||
nextLocus = nextLocus();
|
||||
if(nextPosition.equals(nextLocus.getLocation())) {
|
||||
if (nextPosition.equals(nextLocus.getLocation())) {
|
||||
atNextElement = true;
|
||||
return;
|
||||
}
|
||||
|
|
@ -108,7 +109,7 @@ public class AllLocusView extends LocusView {
|
|||
}
|
||||
|
||||
// No elements left in queue? Clear out the position state tracker and return.
|
||||
if(!locusIterator.hasNext()) {
|
||||
if (!locusIterator.hasNext()) {
|
||||
nextPosition = null;
|
||||
return;
|
||||
}
|
||||
|
|
@ -119,9 +120,9 @@ public class AllLocusView extends LocusView {
|
|||
|
||||
// Crank the iterator to (if possible) or past the next context. Be careful not to hold a reference to nextLocus
|
||||
// while using the hasNextLocus() / nextLocus() machinery; this will cause us to use more memory than is optimal.
|
||||
while(nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) {
|
||||
while (nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) {
|
||||
nextLocus = null;
|
||||
if(!hasNextLocus())
|
||||
if (!hasNextLocus())
|
||||
break;
|
||||
nextLocus = nextLocus();
|
||||
}
|
||||
|
|
@ -129,12 +130,15 @@ public class AllLocusView extends LocusView {
|
|||
|
||||
/**
|
||||
* Creates a blank locus context at the specified location.
|
||||
*
|
||||
* @param site Site at which to create the blank locus context.
|
||||
* @return empty context.
|
||||
*/
|
||||
private final static List<GATKSAMRecord> EMPTY_PILEUP_READS = Collections.emptyList();
|
||||
private final static List<Integer> EMPTY_PILEUP_OFFSETS = Collections.emptyList();
|
||||
private AlignmentContext createEmptyLocus( GenomeLoc site ) {
|
||||
return new AlignmentContext(site,new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS));
|
||||
private final static List<Boolean> EMPTY_DELETION_STATUS = Collections.emptyList();
|
||||
|
||||
private AlignmentContext createEmptyLocus(GenomeLoc site) {
|
||||
return new AlignmentContext(site, new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,9 +25,14 @@ import java.util.NoSuchElementException;
|
|||
*/
|
||||
|
||||
/**
|
||||
* A queue of locus context entries.
|
||||
* The two goals of the LocusView are as follows:
|
||||
* 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch
|
||||
* between iterating over all bases in a region, only covered bases in a region covered by
|
||||
* reads, only bases in a region covered by RODs, or any other sort of trigger track
|
||||
* implementation one can think of.
|
||||
* 2) To manage the copious number of iterators that have to be jointly pulled through the
|
||||
* genome to make a locus traversal function.
|
||||
*/
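
The "trigger track" idea is that the traversal decides which positions get visited, independently of whether the underlying iterators have data there. A small sketch of that pattern, independent of the GATK classes:

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class TriggerTrackDemo {
    public static void main(final String[] args) {
        // Trigger track: every position from 1..10 is visited, whether or not
        // the (sorted) list of covered positions has data for it.
        final List<Integer> coveredPositions = Arrays.asList(3, 4, 7);
        final Iterator<Integer> covered = coveredPositions.iterator();
        Integer nextCovered = covered.hasNext() ? covered.next() : null;

        for (int position = 1; position <= 10; position++) {
            if (nextCovered != null && nextCovered == position) {
                System.out.println(position + ": covered locus");
                nextCovered = covered.hasNext() ? covered.next() : null;
            } else {
                System.out.println(position + ": empty locus");
            }
        }
    }
}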
|
||||
|
||||
public abstract class LocusView extends LocusIterator implements View {
|
||||
/**
|
||||
* The locus bounding this view.
|
||||
|
|
|
|||
|
|
@ -27,8 +27,10 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
|
@ -38,7 +40,7 @@ import java.util.List;
|
|||
* Time: 10:47 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class SAMReaderPosition {
|
||||
class BAMAccessPlan {
|
||||
private final SAMReaderID reader;
|
||||
private final BlockInputStream inputStream;
|
||||
|
||||
|
|
@ -51,7 +53,7 @@ class SAMReaderPosition {
|
|||
private long nextBlockAddress;
|
||||
|
||||
|
||||
SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
|
||||
BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
|
||||
this.reader = reader;
|
||||
this.inputStream = inputStream;
|
||||
|
||||
|
|
@ -84,11 +86,45 @@ class SAMReaderPosition {
|
|||
}
|
||||
|
||||
/**
|
||||
* Retrieves the last offset of interest in the block returned by getBlockAddress().
|
||||
* @return First block of interest in this segment.
|
||||
* Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer.
|
||||
* @param blockAddress Block address for which to search.
|
||||
* @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span.
|
||||
* @return list of chunks containing that block.
|
||||
*/
|
||||
public int getLastOffsetInBlock() {
|
||||
return (nextBlockAddress == positionIterator.peek().getBlockEnd()) ? positionIterator.peek().getBlockOffsetEnd() : 65536;
|
||||
public List<GATKChunk> getSpansOverlappingBlock(long blockAddress, long filePosition) {
|
||||
List<GATKChunk> spansOverlapping = new LinkedList<GATKChunk>();
|
||||
// While the position iterator overlaps the given block, pull out spans to report.
|
||||
while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) {
|
||||
// Create a span over as much of the block as is covered by this chunk.
|
||||
int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0;
|
||||
|
||||
// Calculate the end of this span. If the span extends past this block, cap it using the current file position.
|
||||
long blockEnd;
|
||||
int blockOffsetEnd;
|
||||
if(blockAddress < positionIterator.peek().getBlockEnd()) {
|
||||
blockEnd = filePosition;
|
||||
blockOffsetEnd = 0;
|
||||
}
|
||||
else {
|
||||
blockEnd = positionIterator.peek().getBlockEnd();
|
||||
blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd();
|
||||
}
|
||||
|
||||
GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd);
|
||||
|
||||
if(newChunk.getChunkStart() <= newChunk.getChunkEnd())
|
||||
spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd));
|
||||
|
||||
// If the value currently stored in the position iterator ends past the current block, we must be done. Abort.
|
||||
if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress)
|
||||
break;
|
||||
|
||||
// If the position iterator ends before the block ends, pull the position iterator forward.
|
||||
if(positionIterator.peek().getBlockEnd() <= blockAddress)
|
||||
positionIterator.next();
|
||||
}
|
||||
|
||||
return spansOverlapping;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
|
|
@ -111,20 +147,16 @@ class SAMReaderPosition {
|
|||
* @param filePosition The current position within the file.
|
||||
*/
|
||||
void advancePosition(final long filePosition) {
|
||||
nextBlockAddress = filePosition >> 16;
|
||||
nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition);
|
||||
|
||||
// Check the current file position against the iterator; if the iterator is before the current file position,
|
||||
// draw the iterator forward. Remember when performing the check that coordinates are half-open!
|
||||
while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
|
||||
while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek()))
|
||||
positionIterator.next();
|
||||
|
||||
// If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block.
|
||||
if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) {
|
||||
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||
//System.out.printf("SAMReaderPosition: next block address advanced to %d%n",nextBlockAddress);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block.
|
||||
if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart())
|
||||
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||
|
||||
// If we've shot off the end of the block pointer, notify consumers that iteration is complete.
|
||||
if(!positionIterator.hasNext())
|
||||
|
|
@ -407,7 +407,14 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
position(currentPosition);
|
||||
|
||||
// Read data.
|
||||
read(binHeader);
|
||||
int binHeaderBytesRead = read(binHeader);
|
||||
|
||||
// Make sure we read in a complete bin header:
|
||||
if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) {
|
||||
throw new ReviewedStingException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " +
|
||||
"The BAM schedule file is likely incomplete/corrupt.",
|
||||
scheduleFile.getAbsolutePath(), reader.getSamFilePath()));
|
||||
}
|
||||
|
||||
// Decode contents.
|
||||
binHeader.flip();
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ import net.sf.samtools.SAMSequenceRecord;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -245,7 +247,14 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
// This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
|
||||
// we'll be using the correct contig index for the BAMs.
|
||||
// TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
|
||||
final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
|
||||
SAMSequenceRecord currentContigSequenceRecord = dataSource.getHeader().getSequence(currentLocus.getContig());
|
||||
if ( currentContigSequenceRecord == null ) {
|
||||
throw new UserException(String.format("Contig %s not present in sequence dictionary for merged BAM header: %s",
|
||||
currentLocus.getContig(),
|
||||
ReadUtils.prettyPrintSequenceRecords(dataSource.getHeader().getSequenceDictionary())));
|
||||
}
|
||||
|
||||
final int currentContigIndex = currentContigSequenceRecord.getSequenceIndex();
|
||||
|
||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
|
||||
|
|
|
|||
|
|
@ -44,12 +44,12 @@ public class BGZFBlockLoadingDispatcher {
|
|||
|
||||
private final ExecutorService threadPool;
|
||||
|
||||
private final Queue<SAMReaderPosition> inputQueue;
|
||||
private final Queue<BAMAccessPlan> inputQueue;
|
||||
|
||||
public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
|
||||
threadPool = Executors.newFixedThreadPool(numThreads);
|
||||
fileHandleCache = new FileHandleCache(numFileHandles);
|
||||
inputQueue = new LinkedList<SAMReaderPosition>();
|
||||
inputQueue = new LinkedList<BAMAccessPlan>();
|
||||
|
||||
threadPool.execute(new BlockLoader(this,fileHandleCache,true));
|
||||
}
|
||||
|
|
@ -58,7 +58,7 @@ public class BGZFBlockLoadingDispatcher {
|
|||
* Initiates a request for a new block load.
|
||||
* @param readerPosition Position at which to load.
|
||||
*/
|
||||
void queueBlockLoad(final SAMReaderPosition readerPosition) {
|
||||
void queueBlockLoad(final BAMAccessPlan readerPosition) {
|
||||
synchronized(inputQueue) {
|
||||
inputQueue.add(readerPosition);
|
||||
inputQueue.notify();
|
||||
|
|
@ -69,7 +69,7 @@ public class BGZFBlockLoadingDispatcher {
|
|||
* Claims the next work request from the queue.
|
||||
* @return The next work request, or null if none is available.
|
||||
*/
|
||||
SAMReaderPosition claimNextWorkRequest() {
|
||||
BAMAccessPlan claimNextWorkRequest() {
|
||||
synchronized(inputQueue) {
|
||||
while(inputQueue.isEmpty()) {
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -26,24 +26,21 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.util.BAMInputStream;
|
||||
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.RuntimeEOFException;
|
||||
import net.sf.samtools.util.SeekableStream;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Presents decompressed blocks to the SAMFileReader.
|
||||
*/
|
||||
public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||
public class BlockInputStream extends InputStream {
|
||||
/**
|
||||
* Mechanism for triggering block loads.
|
||||
*/
|
||||
|
|
@ -65,9 +62,9 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
private Throwable error;
|
||||
|
||||
/**
|
||||
* Current position.
|
||||
* Current accessPlan.
|
||||
*/
|
||||
private SAMReaderPosition position;
|
||||
private BAMAccessPlan accessPlan;
|
||||
|
||||
/**
|
||||
* A stream of compressed data blocks.
|
||||
|
|
@ -94,11 +91,6 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
*/
|
||||
private final BlockCompressedInputStream validatingInputStream;
|
||||
|
||||
/**
|
||||
* Has the buffer been filled since last request?
|
||||
*/
|
||||
private boolean bufferFilled = false;
|
||||
|
||||
/**
|
||||
* Create a new block presenting input stream with a dedicated buffer.
|
||||
* @param dispatcher the block loading messenger.
|
||||
|
|
@ -118,7 +110,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
|
||||
this.dispatcher = dispatcher;
|
||||
// TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
|
||||
this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
|
||||
this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
|
||||
|
||||
// The block offsets / block positions guarantee that the ending offset/position in the data structure maps to
|
||||
// the point in the file just following the last read. These two arrays should never be empty; initializing
|
||||
|
|
@ -151,7 +143,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
synchronized(lock) {
|
||||
// Find the current block within the input stream.
|
||||
int blockIndex;
|
||||
for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() >= blockOffsets.get(blockIndex + 1); blockIndex++)
|
||||
for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++)
|
||||
;
|
||||
filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex));
|
||||
}
|
||||
|
|
@ -164,51 +156,8 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
return filePointer;
|
||||
}
|
||||
|
||||
public void seek(long target) {
|
||||
//System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
|
||||
synchronized(lock) {
|
||||
clearBuffers();
|
||||
|
||||
// Ensure that the position filled in by submitAccessPlan() is in sync with the seek target just specified.
|
||||
position.advancePosition(target);
|
||||
|
||||
// If the position advances past the end of the target, that must mean that we seeked to a point at the end
|
||||
// of one of the chunk list's subregions. Make a note of our current position and punt on loading any data.
|
||||
if(target < position.getBlockAddress() << 16) {
|
||||
blockOffsets.clear();
|
||||
blockOffsets.add(0);
|
||||
blockPositions.clear();
|
||||
blockPositions.add(target);
|
||||
}
|
||||
else {
|
||||
waitForBufferFill();
|
||||
// A buffer fill will load the relevant data from the shard, but the buffer position still needs to be
|
||||
// advanced as appropriate.
|
||||
Iterator<Integer> blockOffsetIterator = blockOffsets.descendingIterator();
|
||||
Iterator<Long> blockPositionIterator = blockPositions.descendingIterator();
|
||||
while(blockOffsetIterator.hasNext() && blockPositionIterator.hasNext()) {
|
||||
final int blockOffset = blockOffsetIterator.next();
|
||||
final long blockPosition = blockPositionIterator.next();
|
||||
if((blockPosition >> 16) == (target >> 16) && (blockPosition&0xFFFF) < (target&0xFFFF)) {
|
||||
buffer.position(blockOffset + (int)(target&0xFFFF)-(int)(blockPosition&0xFFFF));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(validatingInputStream != null) {
|
||||
try {
|
||||
validatingInputStream.seek(target);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void clearBuffers() {
|
||||
this.position.reset();
|
||||
this.accessPlan.reset();
|
||||
|
||||
// Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
|
||||
// Indicate no data to be read.
|
||||
|
|
@ -225,29 +174,41 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
public boolean eof() {
|
||||
synchronized(lock) {
|
||||
// TODO: Handle multiple empty BGZF blocks at end of the file.
|
||||
return position != null && (position.getBlockAddress() < 0 || position.getBlockAddress() >= length);
|
||||
return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length);
|
||||
}
|
||||
}
|
||||
|
||||
public void setCheckCrcs(final boolean check) {
|
||||
// TODO: Implement
|
||||
}
|
||||
|
||||
/**
|
||||
* Submits a new access plan for the given dataset.
|
||||
* @param position The next seek point for BAM data in this reader.
|
||||
* Submits a new access plan for the given dataset and seeks to the given point.
|
||||
* @param accessPlan The next seek point for BAM data in this reader.
|
||||
*/
|
||||
public void submitAccessPlan(final SAMReaderPosition position) {
|
||||
public void submitAccessPlan(final BAMAccessPlan accessPlan) {
|
||||
//System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
|
||||
synchronized(lock) {
|
||||
// Assume that the access plan is going to tell us to start where we are and move forward.
|
||||
// If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
|
||||
if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
|
||||
position.advancePosition(this.position.getBlockAddress() << 16);
|
||||
this.accessPlan = accessPlan;
|
||||
accessPlan.reset();
|
||||
|
||||
clearBuffers();
|
||||
|
||||
// Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc).
|
||||
// TODO: Don't pass these empty chunks in.
|
||||
accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0));
|
||||
|
||||
if(accessPlan.getBlockAddress() >= 0) {
|
||||
waitForBufferFill();
|
||||
}
|
||||
this.position = position;
|
||||
|
||||
if(validatingInputStream != null) {
|
||||
try {
|
||||
validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0));
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void compactBuffer() {
|
||||
// Compact buffer to maximize storage space.
|
||||
int bytesToRemove = 0;
|
||||
|
|
@ -286,27 +247,14 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
* Push contents of incomingBuffer into the end of this buffer.
|
||||
* MUST be called from a thread that is NOT the reader thread.
|
||||
* @param incomingBuffer The data being pushed into this input stream.
|
||||
* @param position target position for the data.
|
||||
* @param accessPlan target access plan for the data.
|
||||
* @param filePosition the current position of the file pointer
|
||||
*/
|
||||
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
|
||||
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) {
|
||||
synchronized(lock) {
|
||||
try {
|
||||
compactBuffer();
|
||||
// Open up the buffer for more reading.
|
||||
buffer.limit(buffer.capacity());
|
||||
|
||||
// Advance the position to take the most recent read into account.
|
||||
final long lastBlockAddress = position.getBlockAddress();
|
||||
final int blockOffsetStart = position.getFirstOffsetInBlock();
|
||||
final int blockOffsetEnd = position.getLastOffsetInBlock();
|
||||
|
||||
// Where did this read end? It either ended in the middle of a block (for a bounding chunk) or it ended at the start of the next block.
|
||||
final long endOfRead = (blockOffsetEnd < incomingBuffer.remaining()) ? (lastBlockAddress << 16) | blockOffsetEnd : filePosition << 16;
|
||||
|
||||
byte[] validBytes = null;
|
||||
if(validatingInputStream != null) {
|
||||
validBytes = new byte[incomingBuffer.remaining()];
|
||||
byte[] validBytes = new byte[incomingBuffer.remaining()];
|
||||
|
||||
byte[] currentBytes = new byte[incomingBuffer.remaining()];
|
||||
int pos = incomingBuffer.position();
|
||||
|
|
@ -317,7 +265,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
incomingBuffer.position(pos);
|
||||
|
||||
long currentFilePointer = validatingInputStream.getFilePointer();
|
||||
validatingInputStream.seek(lastBlockAddress << 16);
|
||||
validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0));
|
||||
validatingInputStream.read(validBytes);
|
||||
validatingInputStream.seek(currentFilePointer);
|
||||
|
||||
|
|
@ -325,33 +273,41 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
|
||||
}
|
||||
|
||||
this.position = position;
|
||||
position.advancePosition(filePosition << 16);
|
||||
compactBuffer();
|
||||
// Open up the buffer for more reading.
|
||||
buffer.limit(buffer.capacity());
|
||||
|
||||
if(buffer.remaining() < incomingBuffer.remaining()) {
|
||||
//System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
|
||||
// Get the spans overlapping this particular block...
|
||||
List<GATKChunk> spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition);
|
||||
|
||||
// ...and advance the block
|
||||
this.accessPlan = accessPlan;
|
||||
accessPlan.advancePosition(makeFilePointer(filePosition, 0));
|
||||
|
||||
if(buffer.remaining() < incomingBuffer.remaining())
|
||||
lock.wait();
|
||||
//System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
|
||||
|
||||
final int bytesInIncomingBuffer = incomingBuffer.limit();
|
||||
|
||||
for(GATKChunk spanOverlapping: spansOverlapping) {
|
||||
// Clear out the endcap tracking state and add in the starting position for this transfer.
|
||||
blockOffsets.removeLast();
|
||||
blockOffsets.add(buffer.position());
|
||||
blockPositions.removeLast();
|
||||
blockPositions.add(spanOverlapping.getChunkStart());
|
||||
|
||||
// Stream the buffer into the data stream.
|
||||
incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd());
|
||||
incomingBuffer.position(spanOverlapping.getBlockOffsetStart());
|
||||
buffer.put(incomingBuffer);
|
||||
|
||||
// Add the endcap for this transfer.
|
||||
blockOffsets.add(buffer.position());
|
||||
blockPositions.add(spanOverlapping.getChunkEnd());
|
||||
}
|
||||
|
||||
// Remove the last position in the list and add in the last read position, in case the two are different.
|
||||
blockOffsets.removeLast();
|
||||
blockOffsets.add(buffer.position());
|
||||
blockPositions.removeLast();
|
||||
blockPositions.add(lastBlockAddress << 16 | blockOffsetStart);
|
||||
|
||||
// Stream the buffer into the data stream.
|
||||
incomingBuffer.position(blockOffsetStart);
|
||||
incomingBuffer.limit(Math.min(incomingBuffer.limit(),blockOffsetEnd));
|
||||
buffer.put(incomingBuffer);
|
||||
|
||||
// Then, add the last position read to the very end of the list, just past the end of the last buffer.
|
||||
blockOffsets.add(buffer.position());
|
||||
blockPositions.add(endOfRead);
|
||||
|
||||
// Set up the buffer for reading.
|
||||
buffer.flip();
|
||||
bufferFilled = true;
|
||||
|
||||
lock.notify();
|
||||
}
|
||||
|
|
@ -447,12 +403,8 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
if(remaining < length)
|
||||
return length - remaining;
|
||||
|
||||
// Otherwise, if at eof(), return -1.
|
||||
else if(eof())
|
||||
return -1;
|
||||
|
||||
// Otherwise, we must've hit a bug in the system.
|
||||
throw new ReviewedStingException("BUG: read returned no data, but eof() reports false.");
|
||||
// Otherwise, return -1.
|
||||
return -1;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
|
|
@ -472,20 +424,26 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
|||
|
||||
private void waitForBufferFill() {
|
||||
synchronized(lock) {
|
||||
bufferFilled = false;
|
||||
if(buffer.remaining() == 0 && !eof()) {
|
||||
//System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
|
||||
dispatcher.queueBlockLoad(position);
|
||||
dispatcher.queueBlockLoad(accessPlan);
|
||||
try {
|
||||
lock.wait();
|
||||
}
|
||||
catch(InterruptedException ex) {
|
||||
throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
|
||||
}
|
||||
|
||||
if(bufferFilled && buffer.remaining() == 0)
|
||||
throw new RuntimeEOFException("No more data left in InputStream");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an encoded BAM file pointer given the address of a BGZF block and an offset.
|
||||
* @param blockAddress Physical address on disk of a BGZF block.
|
||||
* @param blockOffset Offset into the uncompressed data stored in the BGZF block.
|
||||
* @return 64-bit pointer encoded according to the BAM spec.
|
||||
*/
|
||||
public static long makeFilePointer(final long blockAddress, final int blockOffset) {
|
||||
return blockAddress << 16 | blockOffset;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@@ -70,29 +70,29 @@ class BlockLoader implements Runnable {

public void run() {
for(;;) {
SAMReaderPosition readerPosition = null;
BAMAccessPlan accessPlan = null;
try {
readerPosition = dispatcher.claimNextWorkRequest();
FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());
accessPlan = dispatcher.claimNextWorkRequest();
FileInputStream inputStream = fileHandleCache.claimFileInputStream(accessPlan.getReader());

long blockAddress = readerPosition.getBlockAddress();
//long blockAddress = readerPosition.getBlockAddress();
//System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());

ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
ByteBuffer compressedBlock = readBGZFBlock(inputStream,accessPlan.getBlockAddress());
long nextBlockAddress = position(inputStream);
fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);
fileHandleCache.releaseFileInputStream(accessPlan.getReader(),inputStream);

ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
int bytesCopied = block.remaining();

BlockInputStream bamInputStream = readerPosition.getInputStream();
bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
BlockInputStream bamInputStream = accessPlan.getInputStream();
bamInputStream.copyIntoBuffer(block,accessPlan,nextBlockAddress);

//System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
}
catch(Throwable error) {
if(readerPosition != null && readerPosition.getInputStream() != null)
readerPosition.getInputStream().reportException(error);
if(accessPlan != null && accessPlan.getInputStream() != null)
accessPlan.getInputStream().reportException(error);
}
}
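readBGZFBlock() and decompressBGZFBlock() are defined elsewhere in BlockLoader. As background, every BGZF block is a complete, standards-conformant gzip member, so a single block can be inflated on its own. A minimal sketch under that assumption, using only java.util.zip; this is an illustration, not the BlockLoader implementation.

// Inflate one BGZF block, relying on each block being a self-contained gzip member.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;

final class BGZFBlockExample {
    static byte[] inflateBlock(final byte[] compressedBlock) throws IOException {
        final GZIPInputStream gunzip = new GZIPInputStream(new ByteArrayInputStream(compressedBlock));
        final ByteArrayOutputStream uncompressed = new ByteArrayOutputStream();
        final byte[] chunk = new byte[8192];
        int bytesRead;
        while ((bytesRead = gunzip.read(chunk)) > 0)
            uncompressed.write(chunk, 0, bytesRead);   // accumulate the inflated payload
        return uncompressed.toByteArray();
    }
}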
@@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;

import net.sf.samtools.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;

import java.io.File;
import java.io.FileInputStream;
@@ -349,7 +350,18 @@ public class GATKBAMIndex {

private void read(final ByteBuffer buffer) {
try {
fileChannel.read(buffer);
int bytesExpected = buffer.limit();
int bytesRead = fileChannel.read(buffer);

// We have a rigid expectation here to read in exactly the number of bytes we've limited
// our buffer to -- if we read in fewer bytes than this, or encounter EOF (-1), the index
// must be truncated or otherwise corrupt:
if ( bytesRead < bytesExpected ) {
throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " +
"It's likely that this file is truncated or corrupt -- " +
"Please try re-indexing the corresponding BAM file.",
mFile));
}
}
catch(IOException ex) {
throw new ReviewedStingException("Index: unable to read bytes from index file " + mFile);
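The added bytesRead check guards against truncated indexes. Note that FileChannel.read() is also allowed to return fewer bytes than requested on a perfectly healthy file, so a fully defensive reader loops until the buffer is full or end-of-file is seen. A hedged sketch of that read-fully loop; it is illustrative, not the GATKBAMIndex code.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

final class ReadFullyExample {
    // Keep reading until the buffer is full or the channel reports end-of-file (-1).
    static int readFully(final FileChannel channel, final ByteBuffer buffer) throws IOException {
        int total = 0;
        while (buffer.hasRemaining()) {
            final int bytesRead = channel.read(buffer);
            if (bytesRead < 0)
                break;          // true EOF: caller can compare total against buffer.limit()
            total += bytesRead;
        }
        return total;
    }
}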
@@ -0,0 +1,203 @@
/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.datasources.reads;

import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

import java.util.List;
import java.util.NoSuchElementException;

/**
 * High efficiency filtering iterator designed to filter out reads only included
 * in the query results due to the granularity of the BAM index.
 *
 * Built into the BAM index is a notion of 16kbase granularity -- an index query for
 * two regions contained within a 16kbase chunk (say, chr1:5-10 and chr1:11-20) will
 * return exactly the same regions within the BAM file. This iterator is optimized
 * to subtract out reads which do not at all overlap the interval list passed to the
 * constructor.
 *
 * Example:
 * interval list: chr20:6-10
 * Reads that would pass through the filter: chr20:6-10, chr20:1-15, chr20:1-7, chr20:8-15.
 * Reads that would be discarded by the filter: chr20:1-5, chr20:11-15.
 */
class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
/**
 * The wrapped iterator.
 */
private CloseableIterator<SAMRecord> iterator;

/**
 * The next read, queued up and ready to go.
 */
private SAMRecord nextRead;

/**
 * Rather than filtering by the genomic bounds, keep only unmapped reads.
 */
private boolean keepOnlyUnmappedReads;

/**
 * Custom representation of interval bounds.
 * Makes it simpler to track current position.
 */
private int[] intervalContigIndices;
private int[] intervalStarts;
private int[] intervalEnds;

/**
 * Position within the interval list.
 */
private int currentBound = 0;

public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
this.iterator = iterator;

// Look at the interval list to detect whether we should worry about unmapped reads.
// If we find a mix of mapped/unmapped intervals, throw an exception.
boolean foundMappedIntervals = false;
for(GenomeLoc location: intervals) {
if(! GenomeLoc.isUnmapped(location))
foundMappedIntervals = true;
keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location);
}

if(foundMappedIntervals) {
if(keepOnlyUnmappedReads)
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mix of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
this.intervalContigIndices = new int[intervals.size()];
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()];
int i = 0;
for(GenomeLoc interval: intervals) {
intervalContigIndices[i] = interval.getContigIndex();
intervalStarts[i] = interval.getStart();
intervalEnds[i] = interval.getStop();
i++;
}
}

advance();
}

public boolean hasNext() {
return nextRead != null;
}

public SAMRecord next() {
if(nextRead == null)
throw new NoSuchElementException("No more reads left in this iterator.");
SAMRecord currentRead = nextRead;
advance();
return currentRead;
}

public void remove() {
throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
}

public void close() {
iterator.close();
}

private void advance() {
nextRead = null;

if(!iterator.hasNext())
return;

SAMRecord candidateRead = iterator.next();
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
if(!keepOnlyUnmappedReads) {
// Mapped read filter; check against GenomeLoc-derived bounds.
if(readEndsOnOrAfterStartingBound(candidateRead)) {
// This read ends after the current interval begins.
// Promising, but this read must be checked against the ending bound.
if(readStartsOnOrBeforeEndingBound(candidateRead)) {
// Yes, this read is within both bounds. This must be our next read.
nextRead = candidateRead;
break;
}
else {
// Oops, we're past the end bound. Increment the current bound and try again.
currentBound++;
continue;
}
}
}
else {
// Found an unmapped read. We're done.
if(candidateRead.getReadUnmappedFlag()) {
nextRead = candidateRead;
break;
}
}

// No more reads available. Stop the search.
if(!iterator.hasNext())
break;

// No reasonable read found; advance the iterator.
candidateRead = iterator.next();
}
}

/**
 * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its
 * end will be distorted, so rely only on the alignment start.
 * @param read The read to position-check.
 * @return True if the read ends on or after the start of the current bound. False otherwise.
 */
private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
return
// Read ends on a later contig, or...
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
// Read ends on this contig...
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
// either after this location, or...
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
// read is unmapped but positioned and alignment start is on or after this start point.
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
}

/**
 * Check whether the read lies before the end of the current bound.
 * @param read The read to position-check.
 * @return True if the read starts on or before the end of the current bound. False otherwise.
 */
private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
return
// Read starts on a prior contig, or...
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
// Read starts on this contig and the alignment start is registered before this end point.
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
}
}
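Since the index can return reads from anywhere inside a 16 kb bin, the filter's job reduces to the standard interval-overlap test sketched below. Coordinates and names are illustrative stand-ins, not the GenomeLoc/SAMRecord API.

final class OverlapExample {
    // A read overlaps a (1-based, inclusive) interval when it is on the same contig,
    // ends on or after the interval start, and starts on or before the interval end --
    // the same two bounds checks performed by readEndsOnOrAfterStartingBound() and
    // readStartsOnOrBeforeEndingBound() above.
    static boolean overlaps(final int readContig, final int readStart, final int readEnd,
                            final int intervalContig, final int intervalStart, final int intervalEnd) {
        return readContig == intervalContig && readEnd >= intervalStart && readStart <= intervalEnd;
    }

    public static void main(String[] args) {
        // Mirrors the javadoc example for interval chr20:6-10 (contig index 19 is arbitrary here).
        System.out.println(overlaps(19, 1, 15, 19, 6, 10));   // true  -- chr20:1-15 passes
        System.out.println(overlaps(19, 1, 5, 19, 6, 10));    // false -- chr20:1-5 is discarded
    }
}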
@@ -36,7 +36,7 @@ import java.util.Map;
 */
public class ReadShard extends Shard {
/**
 * What is the maximum number of reads which should go into a read shard.
 * What is the maximum number of reads per BAM file which should go into a read shard.
 */
public static int MAX_READS = 10000;
@@ -39,7 +39,6 @@ import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.SimpleTimer;

@@ -47,6 +46,8 @@ import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;

import java.io.File;
@@ -202,6 +203,7 @@ public class SAMDataSource {
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
null, // no BQSR
(byte) -1);
}

@@ -238,6 +240,7 @@ public class SAMDataSource {
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;

@@ -310,6 +313,7 @@ public class SAMDataSource {
cmode,
qmode,
refReader,
bqsrApplier,
defaultBaseQualities);

// cache the read group id (original) -> read group id (merged)
@@ -555,7 +559,7 @@ public class SAMDataSource {
 */
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
// Set up merging to dynamically merge together multiple BAMs.
MergingSamRecordIterator mergingIterator = readers.createMergingIterator();
Map<SAMFileReader,CloseableIterator<SAMRecord>> iteratorMap = new HashMap<SAMFileReader,CloseableIterator<SAMRecord>>();

for(SAMReaderID id: getReaderIDs()) {
CloseableIterator<SAMRecord> iterator = null;

@@ -567,15 +571,23 @@ public class SAMDataSource {

if(threadAllocation.getNumIOThreads() > 0) {
BlockInputStream inputStream = readers.getInputStream(id);
inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id)));
BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory);
codec.setInputStream(inputStream);
iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec);
}
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
else {
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
}

iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
mergingIterator.addIterator(readers.getReader(id),iterator);
iteratorMap.put(readers.getReader(id), iterator);
}

MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap);

return applyDecoratingIterators(shard.getReadMetrics(),
enableVerification,
readProperties.useOriginalBaseQualities(),
@@ -586,9 +598,53 @@ public class SAMDataSource {
readProperties.getBAQCalculationMode(),
readProperties.getBAQQualityMode(),
readProperties.getRefReader(),
readProperties.getBQSRApplier(),
readProperties.defaultBaseQualities());
}

private class BAMCodecIterator implements CloseableIterator<SAMRecord> {
private final BlockInputStream inputStream;
private final SAMFileReader reader;
private final BAMRecordCodec codec;
private SAMRecord nextRead;

private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) {
this.inputStream = inputStream;
this.reader = reader;
this.codec = codec;
advance();
}

public boolean hasNext() {
return nextRead != null;
}

public SAMRecord next() {
if(!hasNext())
throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty");
SAMRecord currentRead = nextRead;
advance();
return currentRead;
}

public void close() {
// NO-OP.
}

public void remove() {
throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator");
}

private void advance() {
final long startCoordinate = inputStream.getFilePointer();
nextRead = codec.decode();
final long stopCoordinate = inputStream.getFilePointer();

if(reader != null && nextRead != null)
PicardNamespaceUtils.setFileSource(nextRead,new SAMFileSource(reader,new GATKBAMFileSpan(new GATKChunk(startCoordinate,stopCoordinate))));
}
}

/**
 * Filter reads based on user-specified criteria.
 *
@@ -612,9 +668,10 @@ public class SAMDataSource {
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
if ( useOriginalBaseQualities || defaultBaseQualities >= 0 )
// only wrap if we are replacing the original qualitiies or using a default base quality
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
// only wrap if we are replacing the original qualities or using a default base quality
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);

// NOTE: this (and other filtering) should be done before on-the-fly sorting

@@ -627,6 +684,9 @@ public class SAMDataSource {
if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);

if (bqsrApplier != null)
wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier);

if (cmode != BAQ.CalculationMode.OFF)
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode);
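applyDecoratingIterators() layers behavior by repeatedly rebinding wrappedIterator to a new wrapper -- a plain decorator chain. A small hedged sketch of the same shape over java.util.Iterator, independent of the GATK iterator classes; the names here are illustrative.

import java.util.Iterator;
import java.util.NoSuchElementException;

final class FilteringIteratorExample {
    interface Predicate<T> { boolean accept(T value); }

    // Decorate an iterator so only accepted elements pass through, mirroring the
    // way each wrapper above consumes the previous wrappedIterator.
    static <T> Iterator<T> filtering(final Iterator<T> inner, final Predicate<T> predicate) {
        return new Iterator<T>() {
            private T next = advance();

            private T advance() {
                while (inner.hasNext()) {
                    final T candidate = inner.next();
                    if (predicate.accept(candidate))
                        return candidate;   // first element the predicate accepts
                }
                return null;                // underlying iterator exhausted
            }

            public boolean hasNext() { return next != null; }

            public T next() {
                if (next == null) throw new NoSuchElementException();
                final T current = next;
                next = advance();
                return current;
            }

            public void remove() { throw new UnsupportedOperationException(); }
        };
    }
}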
@@ -796,8 +856,13 @@ public class SAMDataSource {
return headerMerger.getReadGroupId(header,originalReadGroupID);
}

public MergingSamRecordIterator createMergingIterator() {
return new MergingSamRecordIterator(headerMerger,readers.values(),true);
/**
 * Creates a new merging iterator from the given map, with the given header.
 * @param iteratorMap A map of readers to iterators.
 * @return An iterator which will merge those individual iterators.
 */
public MergingSamRecordIterator createMergingIterator(final Map<SAMFileReader,CloseableIterator<SAMRecord>> iteratorMap) {
return new MergingSamRecordIterator(headerMerger,iteratorMap,true);
}

/**
@@ -863,12 +928,9 @@ public class SAMDataSource {
public ReaderInitializer call() {
final File indexFile = findIndexFile(readerID.samFile);
try {
if (threadAllocation.getNumIOThreads() > 0) {
if (threadAllocation.getNumIOThreads() > 0)
blockInputStream = new BlockInputStream(dispatcher,readerID,false);
reader = new SAMFileReader(blockInputStream,indexFile,false);
}
else
reader = new SAMFileReader(readerID.samFile,indexFile,false);
reader = new SAMFileReader(readerID.samFile,indexFile,false);
} catch ( RuntimeIOException e ) {
if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException )
throw new UserException.CouldNotReadInputFile(readerID.samFile, e);
@@ -927,167 +989,6 @@ public class SAMDataSource {
 */
private class ReadGroupMapping extends HashMap<String,String> {}

/**
 * Filters out reads that do not overlap the current GenomeLoc.
 * Note the custom implementation: BAM index querying returns all reads that could
 * possibly overlap the given region (and quite a few extras). In order not to drag
 * down performance, this implementation is highly customized to its task.
 */
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
[The body of this removed inner class is identical, line for line, to the new top-level IntervalOverlapFilteringIterator introduced earlier in this diff.]
}

/**
 * Locates the index file alongside the given BAM, if present.
 * TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent.
@@ -11,7 +11,7 @@ import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;

import java.util.Collection;

@@ -102,7 +102,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
while (isShardTraversePending() || isTreeReducePending()) {
// Check for errors during execution.
if(hasTraversalErrorOccurred())
throw new ReviewedStingException("An error has occurred during the traversal.",getTraversalError());
throw getTraversalError();

// Too many files sitting around taking up space? Merge them.
if (isMergeLimitExceeded())

@@ -345,10 +345,15 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
return error != null;
}

private synchronized Throwable getTraversalError() {
private synchronized StingException getTraversalError() {
if(!hasTraversalErrorOccurred())
throw new ReviewedStingException("User has attempted to retrieve a traversal error when none exists");
return error;

// If the error is already a StingException, pass it along as is. Otherwise, wrap it.
if(error instanceof StingException)
return (StingException)error;
else
return new ReviewedStingException("An error occurred during the traversal.",error);
}

/**
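getTraversalError() now rethrows StingExceptions unchanged and wraps anything else exactly once. A hedged sketch of that idiom with a stand-in exception type; "DomainException" is illustrative, not a GATK class.

final class ErrorWrappingExample {
    static class DomainException extends RuntimeException {
        DomainException(String message, Throwable cause) { super(message, cause); }
    }

    static DomainException asDomainException(final Throwable error) {
        if (error instanceof DomainException)
            return (DomainException) error;    // already the domain type: pass along unchanged
        return new DomainException("An error occurred during the traversal.", error);
    }
}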
@@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.SampleUtils;

@@ -55,7 +56,6 @@ public class LinearMicroScheduler extends MicroScheduler {

traversalEngine.startTimersIfNecessary();
if(shard.getShardType() == Shard.ShardType.LOCUS) {
LocusWalker lWalker = (LocusWalker)walker;
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {

@@ -77,6 +77,12 @@ public class LinearMicroScheduler extends MicroScheduler {
done = walker.isDone();
}

// Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine
if( traversalEngine instanceof TraverseActiveRegions ) {
final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit());
accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator
}

Object result = accumulator.finishTraversal();

printOnTraversalDone(result);
@@ -128,6 +128,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
traversalEngine = new TraverseDuplicates();
} else if (walker instanceof ReadPairWalker) {
traversalEngine = new TraverseReadPairs();
} else if (walker instanceof ActiveRegionWalker) {
traversalEngine = new TraverseActiveRegions();
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
}
@@ -17,9 +17,21 @@ import java.util.List;
import java.util.NoSuchElementException;

/**
 * Buffer shards of data which may or may not contain multiple loci into
 * iterators of all data which cover an interval. Its existence is an homage
 * to Mark's stillborn WindowMaker, RIP 2009.
 * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci
 * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp
 * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of
 * loci to only those covered by the given interval list.
 *
 * Example:
 * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10
 * Incoming intervals: chr20:3-7
 *
 * Locus iterator by state will produce the following stream of data:
 * chr20:1 {A}, chr20:2 {A,B,C}, chr20:3 {A,B,C,D}, chr20:4 {A,B,C,D}, chr20:5 {A,B,C,D,E},
 * chr20:6 {B,C,D,E}, chr20:7 {C,D,E}, chr20:8 {D,E}, chr20:9 {E}, chr20:10 {E}
 *
 * WindowMakerIterator will then filter the incoming stream, emitting the following stream:
 * chr20:3 {A,B,C,D}, chr20:4 {A,B,C,D}, chr20:5 {A,B,C,D,E}, chr20:6 {B,C,D,E}, chr20:7 {C,D,E}
 *
 * @author mhanna
 * @version 0.1
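A minimal sketch of the windowing step the new javadoc describes: keep only the loci whose positions fall inside the requested interval. The types here are simplified stand-ins, not the WindowMaker API.

import java.util.SortedMap;
import java.util.TreeMap;

final class WindowingExample {
    // Keep only the loci of a position-sorted stream that fall inside the
    // (1-based, inclusive) window [windowStart, windowEnd].
    static <T> SortedMap<Integer, T> restrictToWindow(final SortedMap<Integer, T> lociByPosition,
                                                      final int windowStart, final int windowEnd) {
        return lociByPosition.subMap(windowStart, windowEnd + 1);   // half-open upper bound, hence +1
    }

    public static void main(String[] args) {
        final SortedMap<Integer, String> loci = new TreeMap<Integer, String>();
        for (int position = 1; position <= 10; position++)
            loci.put(position, "pileup@" + position);
        // With the javadoc's example window chr20:3-7, only positions 3..7 survive.
        System.out.println(restrictToWindow(loci, 3, 7).keySet());   // prints [3, 4, 5, 6, 7]
    }
}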
@@ -49,9 +49,13 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;

import java.util.*;

/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */
/**
 * Iterator that traverses a SAM File, accumulating information on a per-locus basis
 */
public class LocusIteratorByState extends LocusIterator {
/** our log, which we want to capture anything from this class */
/**
 * our log, which we want to capture anything from this class
 */
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);

// -----------------------------------------------------------------------------------------------------------------
@@ -70,16 +74,15 @@ public class LocusIteratorByState extends LocusIterator {

static private class SAMRecordState {
SAMRecord read;
int readOffset = -1; // how far are we offset from the start of the read bases?
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
int readOffset = -1; // how far are we offset from the start of the read bases?
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?

Cigar cigar = null;
int cigarOffset = -1;
CigarElement curElement = null;
int nCigarElements = 0;

// how far are we into a single cigarElement
int cigarElementCounter = -1;
int cigarElementCounter = -1; // how far are we into a single cigarElement

// The logical model for generating extended events is as follows: the "record state" implements the traversal
// along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This

@@ -89,17 +92,19 @@ public class LocusIteratorByState extends LocusIterator {
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
// events immediately preceding the current reference base).

boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
// the only purpose of this flag is to shield away a few additional lines of code
// when extended piles are not needed, it may not be even worth it...
byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
// current base on the ref. We use a counter-like variable here since clearing the indel event is
// delayed by one base, so we need to remember how long ago we have seen the actual event
int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
// event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
// we cache it here mainly for convenience
boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
// the only purpose of this flag is to shield away a few additional lines of code
// when extended piles are not needed, it may not be even worth it...

byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
// current base on the ref. We use a counter-like variable here since clearing the indel event is
// delayed by one base, so we need to remember how long ago we have seen the actual event

int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
// event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
// we cache it here mainly for convenience

public SAMRecordState(SAMRecord read, boolean extended) {
@@ -111,23 +116,31 @@ public class LocusIteratorByState extends LocusIterator {
//System.out.printf("Creating a SAMRecordState: %s%n", this);
}

public SAMRecord getRead() { return read; }
public SAMRecord getRead() {
return read;
}

/**
 * What is our current offset in the read's bases that aligns us with the reference genome?
 *
 * @return
 */
public int getReadOffset() { return readOffset; }
public int getReadOffset() {
return readOffset;
}

/**
 * What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
 *
 * @return
 */
public int getGenomeOffset() { return genomeOffset; }
public int getGenomeOffset() {
return genomeOffset;
}

public int getGenomePosition() { return read.getAlignmentStart() + getGenomeOffset(); }
public int getGenomePosition() {
return read.getAlignmentStart() + getGenomeOffset();
}

public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
@@ -137,52 +150,66 @@ public class LocusIteratorByState extends LocusIterator {
return curElement.getOperator();
}

/** Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome.
/**
 * Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome.
 *
 * @return
 */
public boolean hadIndel() {
return ( eventLength > 0 );
return (eventLength > 0);
}

public int getEventLength() { return eventLength; }
public int getEventLength() {
return eventLength;
}

public byte[] getEventBases() { return insertedBases; }
public byte[] getEventBases() {
return insertedBases;
}

public int getReadEventStartOffset() { return eventStart; }
public int getReadEventStartOffset() {
return eventStart;
}

public String toString() {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
}

public CigarElement peekForwardOnGenome() {
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
}

public CigarOperator stepForwardOnGenome() {
// we enter this method with readOffset = index of the last processed base on the read
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion

if ( curElement == null || ++cigarElementCounter > curElement.getLength() ) {
if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
cigarOffset++;
if ( cigarOffset < nCigarElements ) {
if (cigarOffset < nCigarElements) {
curElement = cigar.getCigarElement(cigarOffset);
cigarElementCounter = 0;
// next line: guards against cigar elements of length 0; when new cigar element is retrieved,
// we reenter in order to re-check cigarElementCounter against curElement's length
return stepForwardOnGenome();
} else {
if (curElement != null && curElement.getOperator() == CigarOperator.D)
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString());

// Reads that contain indels model the genomeOffset as the following base in the reference. Because
// we fall into this else block only when indels end the read, increment genomeOffset such that the
// current offset of this read is the next ref base after the end of the indel. This position will
// model a point on the reference somewhere after the end of the read.
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.

if ( generateExtendedEvents && eventDelayedFlag > 0 ) {
if (generateExtendedEvents && eventDelayedFlag > 0) {

// if we had an indel right before the read ended (i.e. insertion was the last cigar element),
// we keep it until next reference base; then we discard it and this will allow the LocusIterator to
// finally discard this read
eventDelayedFlag--;
if ( eventDelayedFlag == 0 ) {
if (eventDelayedFlag == 0) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
@@ -193,34 +220,37 @@ public class LocusIteratorByState extends LocusIterator {
}
}

boolean done = false;
switch (curElement.getOperator()) {
case H : // ignore hard clips
case P : // ignore pads
case H: // ignore hard clips
case P: // ignore pads
cigarElementCounter = curElement.getLength();
break;
case I : // insertion w.r.t. the reference
if ( generateExtendedEvents ) {
case I: // insertion w.r.t. the reference
if (generateExtendedEvents) {
// we see insertions only once, when we step right onto them; the position on the read is scrolled
// past the insertion right after that
if ( eventDelayedFlag > 1 ) throw new UserException.MalformedBAM(read, "Adjacent I/D events in read "+read.getReadName());
insertedBases = Arrays.copyOfRange(read.getReadBases(),readOffset+1,readOffset+1+curElement.getLength());
eventLength = curElement.getLength() ;
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
eventLength = curElement.getLength();
eventStart = readOffset;
eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2
// System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset);
} // continue onto the 'S' case !
case S : // soft clip
case S: // soft clip
cigarElementCounter = curElement.getLength();
readOffset += curElement.getLength();
break;
case D : // deletion w.r.t. the reference
if ( generateExtendedEvents ) {
if ( cigarElementCounter == 1) {
case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
if (generateExtendedEvents) {
if (cigarElementCounter == 1) {
// generate an extended event only if we just stepped into the deletion (i.e. don't
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
if ( eventDelayedFlag > 1 ) throw new UserException.MalformedBAM(read, "Adjacent I/D events in read "+read.getReadName());
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
eventLength = curElement.getLength();
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
eventStart = readOffset;

@@ -232,26 +262,27 @@ public class LocusIteratorByState extends LocusIterator {
genomeOffset++;
done = true;
break;
case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
genomeOffset++;
done = true;
break;
case M :
case M:
readOffset++;
genomeOffset++;
done = true;
break;
default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
default:
throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
}

if ( generateExtendedEvents ) {
if ( eventDelayedFlag > 0 && done ) {
// if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementthe,
if (generateExtendedEvents) {
if (eventDelayedFlag > 0 && done) {
// if we did make a successful step on the ref, decrement the delayed flag. If, after decrementing,
// the flag is 1, we are standing on the reference base right after the indel (so we have to keep it).
// Otherwise, we are away from the previous indel and have to clear our memories...
eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now
// if eventDelayedFlag == 1, an indel occured right before the current base
// if eventDelayedFlag == 1, an indel occurred right before the current base
if (eventDelayedFlag == 0) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
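stepForwardOnGenome() advances the read and reference offsets according to the current CIGAR operator. A hedged, self-contained sketch of that bookkeeping -- how M/I/S/D/N/H/P consume read and/or reference bases per the SAM spec; the names here are illustrative, not the GATK implementation.

// Minimal sketch of CIGAR bookkeeping, mirroring the switch in stepForwardOnGenome().
final class CigarWalkExample {
    static void walk(final String cigar) {
        int readOffset = 0, refOffset = 0, length = 0;
        for (final char c : cigar.toCharArray()) {
            if (Character.isDigit(c)) { length = length * 10 + (c - '0'); continue; }
            switch (c) {
                case 'M': readOffset += length; refOffset += length; break; // aligned bases consume both
                case 'I':
                case 'S': readOffset += length; break;                      // insertions/soft clips consume read only
                case 'D':
                case 'N': refOffset += length; break;                       // deletions/skips consume reference only
                case 'H':
                case 'P': break;                                            // hard clips/pads consume neither
                default: throw new IllegalArgumentException("Unhandled CIGAR operator: " + c);
            }
            length = 0;
        }
        System.out.printf("read bases consumed: %d, reference bases consumed: %d%n", readOffset, refOffset);
    }

    public static void main(String[] args) {
        walk("3M1I4M2D5M");   // consumes 13 read bases and 14 reference bases
    }
}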
@@ -274,15 +305,15 @@ public class LocusIteratorByState extends LocusIterator {
//
// -----------------------------------------------------------------------------------------------------------------

public LocusIteratorByState(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples ) {
public LocusIteratorByState(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
this.readInfo = readInformation;
this.genomeLocParser = genomeLocParser;
this.samples = new ArrayList<String>(samples);
this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod());

// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
if ( this.samples.isEmpty() && samIterator.hasNext() ) {
if (this.samples.isEmpty() && samIterator.hasNext()) {
throw new IllegalArgumentException("samples list must not be empty");
}
}
@@ -322,7 +353,7 @@ public class LocusIteratorByState extends LocusIterator {
// -----------------------------------------------------------------------------------------------------------------
public AlignmentContext next() {
lazyLoadNextAlignmentContext();
if(!hasNext())
if (!hasNext())
throw new NoSuchElementException("LocusIteratorByState: out of elements.");
AlignmentContext currentAlignmentContext = nextAlignmentContext;
nextAlignmentContext = null;
@@ -334,7 +365,7 @@ public class LocusIteratorByState extends LocusIterator {
 * nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
 */
private void lazyLoadNextAlignmentContext() {
while(nextAlignmentContext == null && readStates.hasNext()) {
while (nextAlignmentContext == null && readStates.hasNext()) {
// this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref:
readStates.collectPendingReads();
@@ -350,14 +381,14 @@ public class LocusIteratorByState extends LocusIterator {
// In this case, the subsequent call to next() will emit the normal pileup at the current base
// and shift the position.
if (readInfo.generateExtendedEvents() && hasExtendedEvents) {
Map<String,ReadBackedExtendedEventPileupImpl> fullExtendedEventPileup = new HashMap<String,ReadBackedExtendedEventPileupImpl>();
Map<String, ReadBackedExtendedEventPileupImpl> fullExtendedEventPileup = new HashMap<String, ReadBackedExtendedEventPileupImpl>();

// get current location on the reference and decrement it by 1: the indels we just stepped over
// are associated with the *previous* reference base
GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1);
GenomeLoc loc = genomeLocParser.incPos(getLocation(), -1);

boolean hasBeenSampled = false;
for(final String sample: samples) {
for (final String sample : samples) {
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
List<ExtendedEventPileupElement> indelPile = new ArrayList<ExtendedEventPileupElement>(readStates.size(sample));
hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample);
@ -368,103 +399,117 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
nMQ0Reads = 0;
|
||||
int maxDeletionLength = 0;
|
||||
|
||||
while(iterator.hasNext()) {
|
||||
SAMRecordState state = iterator.next();
|
||||
if ( state.hadIndel() ) {
|
||||
while (iterator.hasNext()) {
|
||||
final SAMRecordState state = iterator.next();
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
|
||||
final int eventLength = state.getEventLength();
|
||||
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
continue;
|
||||
|
||||
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
|
||||
size++;
|
||||
if ( state.getEventBases() == null ) {
|
||||
ExtendedEventPileupElement pileupElement;
|
||||
if (state.getEventBases() == null) { // Deletion event
|
||||
nDeletions++;
|
||||
maxDeletionLength = Math.max(maxDeletionLength,state.getEventLength());
|
||||
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
|
||||
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
|
||||
}
|
||||
else nInsertions++;
|
||||
indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadEventStartOffset(), state.getEventLength(), state.getEventBases()) );
|
||||
|
||||
} else {
|
||||
// HACK: The readahead mechanism for LocusIteratorByState will effectively read past the current position
|
||||
// and add in extra reads that start after this indel. Skip these reads.
|
||||
// My belief at this moment after empirically looking at read->ref alignment is that, in a cigar string
|
||||
// like 1I76M, the first insertion is between alignment start-1 and alignment start, so we shouldn't be
|
||||
// filtering these out.
|
||||
// TODO: UPDATE! Eric tells me that we *might* want reads adjacent to the pileup in the pileup. Strike this block.
|
||||
//if(state.getRead().getAlignmentStart() > loc.getStart())
|
||||
// continue;
|
||||
|
||||
if ( state.getCurrentCigarOperator() != CigarOperator.N ) {
|
||||
// this read has no indel associated with the previous position on the ref;
|
||||
// we count this read in only if it has actual bases, not N span...
|
||||
if ( state.getCurrentCigarOperator() != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci() ) {
|
||||
|
||||
// if cigar operator is D but the read has no extended event reported (that's why we ended
|
||||
// up in this branch), it means that we are currently inside a deletion that started earlier;
|
||||
// we count such reads (with a longer deletion spanning over a deletion at the previous base we are
|
||||
// about to report) only if includeReadsWithDeletionAtLoci is true.
|
||||
size++;
|
||||
indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()-1, -1) // length=-1 --> noevent
|
||||
);
|
||||
}
|
||||
else { // Insertion event
|
||||
nInsertions++;
|
||||
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
|
||||
}
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
|
||||
indelPile.add(pileupElement);
|
||||
}
|
||||
if ( state.getRead().getMappingQuality() == 0 ) {
|
||||
nMQ0Reads++;
|
||||
|
||||
// this read has no indel so add it to the pileup as a NOEVENT:
|
||||
// a deletion that didn't start here (therefore, not an extended event)
|
||||
// we add (mis)matches as no events.
|
||||
else if (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci()) {
|
||||
size++;
|
||||
indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset));
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
}
|
||||
if( indelPile.size() != 0 ) fullExtendedEventPileup.put(sample,new ReadBackedExtendedEventPileupImpl(loc,indelPile,size,maxDeletionLength,nInsertions,nDeletions,nMQ0Reads));
|
||||
}
|
||||
hasExtendedEvents = false; // we are done with extended events prior to current ref base
|
||||
// System.out.println("Indel(s) at "+loc);
|
||||
// for ( ExtendedEventPileupElement pe : indelPile ) { if ( pe.isIndel() ) System.out.println(" "+pe.toString()); }
|
||||
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
|
||||
} else {
|
||||
GenomeLoc location = getLocation();
|
||||
Map<String,ReadBackedPileupImpl> fullPileup = new HashMap<String,ReadBackedPileupImpl>();
|
||||
|
||||
if (indelPile.size() != 0)
|
||||
fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
|
||||
}
|
||||
hasExtendedEvents = false; // we are done with extended events prior to current ref base
|
||||
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
|
||||
}
|
||||
else { // this is a regular event pileup (not extended)
|
||||
GenomeLoc location = getLocation();
|
||||
Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
boolean hasBeenSampled = false;
|
||||
for(final String sample: samples) {
|
||||
for (final String sample : samples) {
|
||||
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
|
||||
size = 0;
|
||||
nDeletions = 0;
|
||||
nMQ0Reads = 0;
|
||||
size = 0; // number of elements in this sample's pileup
|
||||
nDeletions = 0; // number of deletions in this sample's pileup
|
||||
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
|
||||
|
||||
while(iterator.hasNext()) {
|
||||
SAMRecordState state = iterator.next();
|
||||
if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) {
|
||||
if ( filterBaseInRead((GATKSAMRecord) state.getRead(), location.getStart()) ) {
|
||||
//discarded_bases++;
|
||||
//printStatus("Adaptor bases", discarded_adaptor_bases);
|
||||
continue;
|
||||
} else {
|
||||
//observed_bases++;
|
||||
pile.add(new PileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()));
|
||||
while (iterator.hasNext()) {
|
||||
final SAMRecordState state = iterator.next(); // state object with the read/offset information
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
|
||||
final CigarOperator nextOp = nextElement.getOperator();
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
|
||||
int nextElementLength = nextElement.getLength();
|
||||
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
continue;
|
||||
|
||||
if (op == CigarOperator.D) {
|
||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||
pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
|
||||
null,nextOp == CigarOperator.D? nextElementLength:-1));
|
||||
size++;
|
||||
nDeletions++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
} else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) {
|
||||
size++;
|
||||
pile.add(new PileupElement((GATKSAMRecord) state.getRead(), -1));
|
||||
nDeletions++;
|
||||
}
|
||||
|
||||
if ( state.getRead().getMappingQuality() == 0 ) {
|
||||
nMQ0Reads++;
|
||||
else {
|
||||
if (!filterBaseInRead(read, location.getStart())) {
|
||||
String insertedBaseString = null;
|
||||
if (nextOp == CigarOperator.I) {
|
||||
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
|
||||
}
|
||||
pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
|
||||
insertedBaseString,nextElementLength));
|
||||
size++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( pile.size() != 0 )
|
||||
fullPileup.put(sample,new ReadBackedPileupImpl(location,pile,size,nDeletions,nMQ0Reads));
|
||||
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
|
||||
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
|
||||
}
|
||||
|
||||
updateReadStates(); // critical - must be called after we get the current state offsets and location
|
||||
// if we got reads with non-D/N over the current position, we are done
|
||||
if ( !fullPileup.isEmpty() ) nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location,fullPileup),hasBeenSampled);
|
||||
updateReadStates(); // critical - must be called after we get the current state offsets and location
|
||||
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
|
||||
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
|
||||
}
|
||||
}
|
||||
}
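A note on the insertion handling above: when the next CIGAR element is an I, the bases spliced into the pileup element are the ones immediately following the anchor base at the current read offset. A tiny standalone illustration of that Arrays.copyOfRange arithmetic (the example bases and offsets are made up, not taken from the code):

    import java.util.Arrays;

    public class InsertedBasesExample {
        public static void main(String[] args) {
            byte[] readBases = "ACGTTTGCA".getBytes();   // read sequence
            int readOffset = 3;                          // current (anchor) base
            int len = 2;                                 // length of the following insertion element
            String inserted = new String(Arrays.copyOfRange(readBases, readOffset + 1, readOffset + 1 + len));
            System.out.println(inserted);                // prints "TT"
        }
    }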
|
||||
|
||||
// fast testing of position
|
||||
private boolean readIsPastCurrentPosition(SAMRecord read) {
|
||||
if ( readStates.isEmpty() )
|
||||
if (readStates.isEmpty())
|
||||
return false;
|
||||
else {
|
||||
SAMRecordState state = readStates.getFirst();
|
||||
|
|
@ -485,20 +530,18 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
}

private void updateReadStates() {
for(final String sample: samples) {
for (final String sample : samples) {
Iterator<SAMRecordState> it = readStates.iterator(sample);
while ( it.hasNext() ) {
while (it.hasNext()) {
SAMRecordState state = it.next();
CigarOperator op = state.stepForwardOnGenome();
if ( state.hadIndel() && readInfo.generateExtendedEvents() ) hasExtendedEvents = true;
else {
if (state.hadIndel() && readInfo.generateExtendedEvents())
hasExtendedEvents = true;
else if (op == null) {
// we discard the read only when we are past its end AND the indel at the end of the read (if any) was
// already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
// as the next call to stepForwardOnGenome() will return null again AND will clear the hadIndel() flag.
if ( op == null ) { // we've stepped off the end of the object
//if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart()));
it.remove();
}
it.remove(); // we've stepped off the end of the object
}
}
}
}
|
||||
|
|
@ -508,20 +551,20 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
private class ReadStateManager {
|
||||
private class ReadStateManager {
|
||||
private final PeekableIterator<SAMRecord> iterator;
|
||||
private final DownsamplingMethod downsamplingMethod;
|
||||
private final SamplePartitioner samplePartitioner;
|
||||
private final Map<String,PerSampleReadStateManager> readStatesBySample = new HashMap<String,PerSampleReadStateManager>();
|
||||
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
|
||||
private final int targetCoverage;
|
||||
private int totalReadStates = 0;
|
||||
|
||||
public ReadStateManager(Iterator<SAMRecord> source, DownsamplingMethod downsamplingMethod) {
|
||||
this.iterator = new PeekableIterator<SAMRecord>(source);
|
||||
this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE;
|
||||
switch(this.downsamplingMethod.type) {
|
||||
switch (this.downsamplingMethod.type) {
|
||||
case BY_SAMPLE:
|
||||
if(downsamplingMethod.toCoverage == null)
|
||||
if (downsamplingMethod.toCoverage == null)
|
||||
throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample");
|
||||
this.targetCoverage = downsamplingMethod.toCoverage;
|
||||
break;
|
||||
|
|
@ -529,10 +572,10 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
this.targetCoverage = Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
Map<String,ReadSelector> readSelectors = new HashMap<String,ReadSelector>();
|
||||
for(final String sample: samples) {
|
||||
readStatesBySample.put(sample,new PerSampleReadStateManager());
|
||||
readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector());
|
||||
Map<String, ReadSelector> readSelectors = new HashMap<String, ReadSelector>();
|
||||
for (final String sample : samples) {
|
||||
readStatesBySample.put(sample, new PerSampleReadStateManager());
|
||||
readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector());
|
||||
}
|
||||
|
||||
samplePartitioner = new SamplePartitioner(readSelectors);
|
||||
|
|
@ -541,6 +584,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
/**
|
||||
* Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
|
||||
* for this iterator; if present, total read states will be decremented.
|
||||
*
|
||||
* @param sample The sample.
|
||||
* @return Iterator over the reads associated with that sample.
|
||||
*/
|
||||
|
|
@ -569,6 +613,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
|
||||
/**
|
||||
* Retrieves the total number of reads in the manager across all samples.
|
||||
*
|
||||
* @return Total number of reads over all samples.
|
||||
*/
|
||||
public int size() {
|
||||
|
|
@ -577,6 +622,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
|
||||
/**
|
||||
* Retrieves the total number of reads in the manager in the given sample.
|
||||
*
|
||||
* @param sample The sample.
|
||||
* @return Total number of reads in the given sample.
|
||||
*/
|
||||
|
|
@ -587,6 +633,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
/**
|
||||
* The extent of downsampling; basically, the furthest base out which has 'fallen
|
||||
* victim' to the downsampler.
|
||||
*
|
||||
* @param sample Sample, downsampled independently.
|
||||
* @return Integer stop of the furthest undownsampled region.
|
||||
*/
|
||||
|
|
@ -595,9 +642,9 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
}
|
||||
|
||||
public SAMRecordState getFirst() {
|
||||
for(final String sample: samples) {
|
||||
for (final String sample : samples) {
|
||||
PerSampleReadStateManager reads = readStatesBySample.get(sample);
|
||||
if(!reads.isEmpty())
|
||||
if (!reads.isEmpty())
|
||||
return reads.peek();
|
||||
}
|
||||
return null;
|
||||
|
|
@ -608,19 +655,18 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
}
|
||||
|
||||
public void collectPendingReads() {
|
||||
if(!iterator.hasNext())
|
||||
if (!iterator.hasNext())
|
||||
return;
|
||||
|
||||
if(readStates.size() == 0) {
|
||||
if (readStates.size() == 0) {
|
||||
int firstContigIndex = iterator.peek().getReferenceIndex();
|
||||
int firstAlignmentStart = iterator.peek().getAlignmentStart();
|
||||
while(iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
|
||||
while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
|
||||
samplePartitioner.submitRead(iterator.next());
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
// Fast fail in the case that the read is past the current position.
|
||||
if(readIsPastCurrentPosition(iterator.peek()))
|
||||
if (readIsPastCurrentPosition(iterator.peek()))
|
||||
return;
|
||||
|
||||
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
|
||||
|
|
@ -629,7 +675,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
}
|
||||
samplePartitioner.complete();
|
||||
|
||||
for(final String sample: samples) {
|
||||
for (final String sample : samples) {
|
||||
ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
|
||||
|
||||
Collection<SAMRecord> newReads = new ArrayList<SAMRecord>(aggregator.getSelectedReads());
|
||||
|
|
@ -638,21 +684,20 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
int numReads = statesBySample.size();
|
||||
int downsamplingExtent = aggregator.getDownsamplingExtent();
|
||||
|
||||
if(numReads+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.NONE) {
|
||||
if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) {
|
||||
long readLimit = aggregator.getNumReadsSeen();
|
||||
addReadsToSample(statesBySample,newReads,readLimit);
|
||||
addReadsToSample(statesBySample, newReads, readLimit);
|
||||
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
int[] counts = statesBySample.getCountsPerAlignmentStart();
|
||||
int[] updatedCounts = new int[counts.length];
|
||||
System.arraycopy(counts,0,updatedCounts,0,counts.length);
|
||||
System.arraycopy(counts, 0, updatedCounts, 0, counts.length);
|
||||
|
||||
boolean readPruned = true;
|
||||
while(numReads+newReads.size()>targetCoverage && readPruned) {
|
||||
while (numReads + newReads.size() > targetCoverage && readPruned) {
|
||||
readPruned = false;
|
||||
for(int alignmentStart=updatedCounts.length-1;numReads+newReads.size()>targetCoverage&&alignmentStart>=0;alignmentStart--) {
|
||||
if(updatedCounts[alignmentStart] > 1) {
|
||||
for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) {
|
||||
if (updatedCounts[alignmentStart] > 1) {
|
||||
updatedCounts[alignmentStart]--;
|
||||
numReads--;
|
||||
readPruned = true;
|
||||
|
|
@ -660,7 +705,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
}
|
||||
}
|
||||
|
||||
if(numReads == targetCoverage) {
|
||||
if (numReads == targetCoverage) {
|
||||
updatedCounts[0]--;
|
||||
numReads--;
|
||||
}
|
||||
|
|
@ -668,18 +713,18 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
BitSet toPurge = new BitSet(readStates.size());
int readOffset = 0;

for(int i = 0; i < updatedCounts.length; i++) {
for (int i = 0; i < updatedCounts.length; i++) {
int n = counts[i];
int k = updatedCounts[i];

for(Integer purgedElement: MathUtils.sampleIndicesWithoutReplacement(n,n-k))
toPurge.set(readOffset+purgedElement);
for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k))
toPurge.set(readOffset + purgedElement);

readOffset += counts[i];
}
downsamplingExtent = Math.max(downsamplingExtent,statesBySample.purge(toPurge));

addReadsToSample(statesBySample,newReads,targetCoverage-numReads);
downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge));

addReadsToSample(statesBySample, newReads, targetCoverage - numReads);
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
}
}
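The pruning pass above keeps counts[] (reads per alignment start) and updatedCounts[] (how many should survive), then asks MathUtils.sampleIndicesWithoutReplacement(n, n - k) which of the n states at each start to purge. As a sketch of that contract only -- not the actual MathUtils implementation -- a partial Fisher-Yates shuffle returns k distinct indices chosen uniformly from 0..n-1:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Random;

    public class IndexSampler {
        // Pick k distinct indices uniformly at random from 0..n-1.
        public static List<Integer> sampleIndicesWithoutReplacement(int n, int k, Random rng) {
            int[] idx = new int[n];
            for (int i = 0; i < n; i++) idx[i] = i;
            List<Integer> chosen = new ArrayList<Integer>(k);
            for (int i = 0; i < k; i++) {
                int j = i + rng.nextInt(n - i);          // swap in one of the not-yet-chosen entries
                int tmp = idx[i]; idx[i] = idx[j]; idx[j] = tmp;
                chosen.add(idx[i]);
            }
            return chosen;
        }
    }

Marking each sampled index in the BitSet, offset by the running readOffset, is then exactly what the loop above does before handing the set to purge().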
|
||||
|
|
@ -688,23 +733,25 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
|
||||
/**
|
||||
* Add reads with the given sample name to the given hanger entry.
|
||||
*
|
||||
* @param readStates The list of read states to add this collection of reads.
|
||||
* @param reads Reads to add. Selected reads will be pulled from this source.
|
||||
* @param maxReads Maximum number of reads to add.
|
||||
* @param reads Reads to add. Selected reads will be pulled from this source.
|
||||
* @param maxReads Maximum number of reads to add.
|
||||
*/
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads, final long maxReads) {
|
||||
if(reads.isEmpty())
|
||||
if (reads.isEmpty())
|
||||
return;
|
||||
|
||||
Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();
|
||||
int readCount = 0;
|
||||
for(SAMRecord read: reads) {
|
||||
if(readCount < maxReads) {
|
||||
for (SAMRecord read : reads) {
|
||||
if (readCount < maxReads) {
|
||||
SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents());
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
// TODO: What if we downsample the extended events away?
|
||||
if (state.hadIndel()) hasExtendedEvents = true;
|
||||
if (state.hadIndel())
|
||||
hasExtendedEvents = true;
|
||||
readCount++;
|
||||
}
|
||||
}
|
||||
|
|
@ -735,7 +782,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
}
|
||||
|
||||
public void specifyNewDownsamplingExtent(int downsamplingExtent) {
|
||||
this.downsamplingExtent = Math.max(this.downsamplingExtent,downsamplingExtent);
|
||||
this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent);
|
||||
}
|
||||
|
||||
public int getDownsamplingExtent() {
|
||||
|
|
@ -745,7 +792,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
public int[] getCountsPerAlignmentStart() {
|
||||
int[] counts = new int[readStateCounter.size()];
|
||||
int index = 0;
|
||||
for(Counter counter: readStateCounter)
|
||||
for (Counter counter : readStateCounter)
|
||||
counts[index++] = counter.getCount();
|
||||
return counts;
|
||||
}
|
||||
|
|
@ -766,7 +813,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
wrappedIterator.remove();
|
||||
Counter counter = readStateCounter.peek();
|
||||
counter.decrement();
|
||||
if(counter.getCount() == 0)
|
||||
if (counter.getCount() == 0)
|
||||
readStateCounter.remove();
|
||||
}
|
||||
};
|
||||
|
|
@ -775,13 +822,14 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
/**
|
||||
* Purge the given elements from the bitset. If an element in the bitset is true, purge
|
||||
* the corresponding read state.
|
||||
*
|
||||
* @param elements bits from the set to purge.
|
||||
* @return the extent of the final downsampled read.
|
||||
*/
|
||||
public int purge(final BitSet elements) {
|
||||
int downsamplingExtent = 0;
|
||||
|
||||
if(elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
|
||||
if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
|
||||
|
||||
Iterator<SAMRecordState> readStateIterator = readStates.iterator();
|
||||
|
||||
|
|
@ -794,22 +842,22 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
int toPurge = elements.nextSetBit(0);
|
||||
int removedCount = 0;
|
||||
|
||||
while(readStateIterator.hasNext() && toPurge >= 0) {
|
||||
while (readStateIterator.hasNext() && toPurge >= 0) {
|
||||
SAMRecordState state = readStateIterator.next();
|
||||
downsamplingExtent = Math.max(downsamplingExtent,state.getRead().getAlignmentEnd());
|
||||
downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd());
|
||||
|
||||
if(readIndex == toPurge) {
|
||||
if (readIndex == toPurge) {
|
||||
readStateIterator.remove();
|
||||
currentCounter.decrement();
|
||||
if(currentCounter.getCount() == 0)
|
||||
if (currentCounter.getCount() == 0)
|
||||
counterIterator.remove();
|
||||
removedCount++;
|
||||
toPurge = elements.nextSetBit(toPurge+1);
|
||||
toPurge = elements.nextSetBit(toPurge + 1);
|
||||
}
|
||||
|
||||
readIndex++;
|
||||
alignmentStartCounter--;
|
||||
if(alignmentStartCounter == 0 && counterIterator.hasNext()) {
|
||||
if (alignmentStartCounter == 0 && counterIterator.hasNext()) {
|
||||
currentCounter = counterIterator.next();
|
||||
alignmentStartCounter = currentCounter.getCount();
|
||||
}
|
||||
|
|
@ -849,12 +897,14 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
interface ReadSelector {
|
||||
/**
|
||||
* All previous selectors in the chain have allowed this read. Submit it to this selector for consideration.
|
||||
*
|
||||
* @param read the read to evaluate.
|
||||
*/
|
||||
public void submitRead(SAMRecord read);
|
||||
|
||||
/**
|
||||
* A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid.
|
||||
*
|
||||
* @param read the read previously rejected.
|
||||
*/
|
||||
public void notifyReadRejected(SAMRecord read);
|
||||
|
|
@ -866,12 +916,14 @@ interface ReadSelector {
|
|||
|
||||
/**
|
||||
* Retrieve the number of reads seen by this selector so far.
|
||||
*
|
||||
* @return number of reads seen.
|
||||
*/
|
||||
public long getNumReadsSeen();
|
||||
|
||||
/**
|
||||
* Return the number of reads accepted by this selector so far.
|
||||
*
|
||||
* @return number of reads selected.
|
||||
*/
|
||||
public long getNumReadsSelected();
|
||||
|
|
@ -880,12 +932,14 @@ interface ReadSelector {
|
|||
* Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the
* last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at
* position 3 whose cigar string is 76M, the value returned will be 78.
*
* @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0.
*/
public int getDownsamplingExtent();
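The worked example in the javadoc is simple interval arithmetic: a read whose alignment starts at 3 and whose CIGAR is a single 76M element last aligns at 3 + 76 - 1 = 78. Purely as an illustration (not GATK code):

    public class DownsamplingExtentExample {
        public static void main(String[] args) {
            final int alignmentStart = 3;                           // read thrown out at position 3
            final int matchLength = 76;                             // its CIGAR is 76M
            final int lastAlignedBase = alignmentStart + matchLength - 1;
            System.out.println(lastAlignedBase);                    // prints 78
        }
    }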
|
||||
|
||||
/**
|
||||
* Get the reads selected by this selector.
|
||||
*
|
||||
* @return collection of reads selected by this selector.
|
||||
*/
|
||||
public Collection<SAMRecord> getSelectedReads();
|
||||
|
|
@ -911,7 +965,7 @@ class AllReadsSelector implements ReadSelector {
|
|||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
readsSeen++;
|
||||
downsamplingExtent = Math.max(downsamplingExtent,read.getAlignmentEnd());
|
||||
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
|
||||
}
|
||||
|
||||
public void complete() {
|
||||
|
|
@ -949,18 +1003,18 @@ class NRandomReadSelector implements ReadSelector {
|
|||
private final ReservoirDownsampler<SAMRecord> reservoir;
private final ReadSelector chainedSelector;
private long readsSeen = 0;
private int downsamplingExtent = 0;
private int downsamplingExtent = 0;

public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) {
this.reservoir = new ReservoirDownsampler<SAMRecord>((int)readLimit);
this.reservoir = new ReservoirDownsampler<SAMRecord>((int) readLimit);
this.chainedSelector = chainedSelector;
}

public void submitRead(SAMRecord read) {
SAMRecord displaced = reservoir.add(read);
if(displaced != null && chainedSelector != null) {
if (displaced != null && chainedSelector != null) {
chainedSelector.notifyReadRejected(read);
downsamplingExtent = Math.max(downsamplingExtent,read.getAlignmentEnd());
downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
}
readsSeen++;
}
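ReservoirDownsampler is used here as a black box: add() either keeps the read or hands back whichever record was displaced to make room. A minimal sketch of the classic algorithm-R behaviour such a reservoir presumably follows (an assumption for illustration, not the actual Sting class):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Random;

    public class Reservoir<T> {
        private final List<T> sample = new ArrayList<T>();
        private final Random random = new Random();
        private final int capacity;
        private long seen = 0;

        public Reservoir(int capacity) { this.capacity = capacity; }

        // Returns the displaced element, the new item itself if it was rejected, or null if nothing was evicted.
        public T add(T item) {
            seen++;
            if (sample.size() < capacity) { sample.add(item); return null; }
            int slot = (int) (random.nextDouble() * seen);       // uniform in [0, seen)
            if (slot < capacity) return sample.set(slot, item);  // evict and remember the loser
            return item;                                         // the newcomer is the one downsampled away
        }

        public List<T> contents() { return sample; }
    }

After N submissions each record ends up in the reservoir with probability capacity / N, which is what makes the per-sample coverage cap unbiased.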
|
||||
|
|
@ -970,9 +1024,9 @@ class NRandomReadSelector implements ReadSelector {
|
|||
}
|
||||
|
||||
public void complete() {
|
||||
for(SAMRecord read: reservoir.getDownsampledContents())
|
||||
for (SAMRecord read : reservoir.getDownsampledContents())
|
||||
chainedSelector.submitRead(read);
|
||||
if(chainedSelector != null)
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.complete();
|
||||
}
|
||||
|
||||
|
|
@ -987,7 +1041,7 @@ class NRandomReadSelector implements ReadSelector {
|
|||
|
||||
public int getDownsamplingExtent() {
|
||||
return downsamplingExtent;
|
||||
}
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
return reservoir.getDownsampledContents();
|
||||
|
|
@ -996,7 +1050,7 @@ class NRandomReadSelector implements ReadSelector {
|
|||
public void reset() {
|
||||
reservoir.clear();
|
||||
downsamplingExtent = 0;
|
||||
if(chainedSelector != null)
|
||||
if (chainedSelector != null)
|
||||
chainedSelector.reset();
|
||||
}
|
||||
}
|
||||
|
|
@ -1005,23 +1059,23 @@ class NRandomReadSelector implements ReadSelector {
|
|||
* Note: stores reads by sample ID string, not by sample object
|
||||
*/
|
||||
class SamplePartitioner implements ReadSelector {
|
||||
private final Map<String,ReadSelector> readsBySample;
|
||||
private final Map<String, ReadSelector> readsBySample;
|
||||
private long readsSeen = 0;
|
||||
|
||||
public SamplePartitioner(Map<String,ReadSelector> readSelectors) {
|
||||
public SamplePartitioner(Map<String, ReadSelector> readSelectors) {
|
||||
readsBySample = readSelectors;
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null;
|
||||
if(readsBySample.containsKey(sampleName))
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).submitRead(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public void notifyReadRejected(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null;
|
||||
if(readsBySample.containsKey(sampleName))
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).notifyReadRejected(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
|
@ -1040,23 +1094,23 @@ class SamplePartitioner implements ReadSelector {
|
|||
|
||||
public int getDownsamplingExtent() {
|
||||
int downsamplingExtent = 0;
|
||||
for(ReadSelector storage: readsBySample.values())
|
||||
downsamplingExtent = Math.max(downsamplingExtent,storage.getDownsamplingExtent());
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent());
|
||||
return downsamplingExtent;
|
||||
}
|
||||
|
||||
|
||||
public Collection<SAMRecord> getSelectedReads() {
|
||||
throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner.");
|
||||
}
|
||||
|
||||
public ReadSelector getSelectedReads(String sampleName) {
|
||||
if(!readsBySample.containsKey(sampleName))
|
||||
if (!readsBySample.containsKey(sampleName))
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
return readsBySample.get(sampleName);
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
for(ReadSelector storage: readsBySample.values())
|
||||
for (ReadSelector storage : readsBySample.values())
|
||||
storage.reset();
|
||||
readsSeen = 0;
|
||||
}
@ -90,19 +90,12 @@ public class GATKRunReport {
|
|||
protected static Logger logger = Logger.getLogger(GATKRunReport.class);
|
||||
|
||||
|
||||
// the listing of the fields is somewhat important; this is the order that the simple XML will output them
|
||||
@ElementList(required = true, name = "gatk_header_Information")
|
||||
private List<String> mGATKHeader;
|
||||
|
||||
@Element(required = false, name = "id")
|
||||
private final String id;
|
||||
|
||||
@Element(required = false, name = "exception")
|
||||
private final ExceptionToXML mException;
|
||||
|
||||
@Element(required = true, name = "working_directory")
|
||||
private String currentPath;
|
||||
|
||||
@Element(required = true, name = "start_time")
|
||||
private String startTime = "ND";
|
||||
|
||||
|
|
@ -112,9 +105,6 @@ public class GATKRunReport {
|
|||
@Element(required = true, name = "run_time")
|
||||
private long runTime = 0;
|
||||
|
||||
@Element(required = true, name = "command_line")
|
||||
private String cmdLine = "COULD NOT BE DETERMINED";
|
||||
|
||||
@Element(required = true, name = "walker_name")
|
||||
private String walkerName;
|
||||
|
||||
|
|
@ -127,9 +117,6 @@ public class GATKRunReport {
|
|||
@Element(required = true, name = "max_memory")
|
||||
private long maxMemory;
|
||||
|
||||
@Element(required = true, name = "java_tmp_directory")
|
||||
private String tmpDir;
|
||||
|
||||
@Element(required = true, name = "user_name")
|
||||
private String userName;
|
||||
|
||||
|
|
@ -145,18 +132,13 @@ public class GATKRunReport {
|
|||
@Element(required = true, name = "iterations")
|
||||
private long nIterations;
|
||||
|
||||
@Element(required = true, name = "reads")
|
||||
private long nReads;
|
||||
|
||||
public enum PhoneHomeOption {
|
||||
/** Disable phone home */
|
||||
NO_ET,
|
||||
/** Standard option. Writes to local repository if it can be found, or S3 otherwise */
|
||||
STANDARD,
|
||||
/** Force output to STDOUT. For debugging only */
|
||||
STDOUT,
|
||||
/** Force output to S3. For debugging only */
|
||||
AWS_S3 // todo -- remove me -- really just for testing purposes
|
||||
STDOUT
|
||||
}
|
||||
|
||||
private static final DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss");
|
||||
|
|
@ -174,15 +156,8 @@ public class GATKRunReport {
|
|||
|
||||
logger.debug("Aggregating data for run report");
|
||||
|
||||
mGATKHeader = CommandLineGATK.createApplicationHeader();
|
||||
currentPath = System.getProperty("user.dir");
|
||||
|
||||
// what did we run?
|
||||
id = org.apache.commons.lang.RandomStringUtils.randomAlphanumeric(32);
|
||||
try {
|
||||
cmdLine = engine.createApproximateCommandLineArgumentString(engine, walker);
|
||||
} catch (Exception ignore) { }
|
||||
|
||||
walkerName = engine.getWalkerName(walker.getClass());
|
||||
svnVersion = CommandLineGATK.getVersionNumber();
|
||||
|
||||
|
|
@ -193,7 +168,6 @@ public class GATKRunReport {
|
|||
startTime = dateFormat.format(engine.getStartTime());
|
||||
runTime = (end.getTime() - engine.getStartTime().getTime()) / 1000L; // difference in seconds
|
||||
}
|
||||
tmpDir = System.getProperty("java.io.tmpdir");
|
||||
|
||||
// deal with memory usage
|
||||
Runtime.getRuntime().gc(); // call GC so totalMemory is ~ used memory
|
||||
|
|
@ -204,12 +178,11 @@ public class GATKRunReport {
|
|||
if ( engine.getCumulativeMetrics() != null ) {
|
||||
// it's possible we aborted so early that these data structures aren't initialized
|
||||
nIterations = engine.getCumulativeMetrics().getNumIterations();
|
||||
nReads = engine.getCumulativeMetrics().getNumReadsSeen();
|
||||
}
|
||||
|
||||
// user and hostname -- information about the runner of the GATK
|
||||
userName = System.getProperty("user.name");
|
||||
hostName = "unknown"; // resolveHostname();
|
||||
hostName = Utils.resolveHostname();
|
||||
|
||||
// basic java information
|
||||
java = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version")));
|
||||
|
|
@ -239,11 +212,8 @@ public class GATKRunReport {
|
|||
case STDOUT:
|
||||
postReportToStream(System.out);
|
||||
break;
|
||||
case AWS_S3:
|
||||
postReportToAWSS3();
|
||||
break;
|
||||
default:
|
||||
exceptDuringRunReport("BUG: unexcepted PhoneHomeOption ");
|
||||
exceptDuringRunReport("BUG: unexpected PhoneHomeOption ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -264,22 +234,8 @@ public class GATKRunReport {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens the destination file and writes a gzipped version of the XML report there.
|
||||
*
|
||||
* @param destination
|
||||
* @throws IOException
|
||||
*/
|
||||
private void postReportToFile(File destination) throws IOException {
|
||||
BufferedOutputStream out =
|
||||
new BufferedOutputStream(
|
||||
new GZIPOutputStream(
|
||||
new FileOutputStream(destination)));
|
||||
try {
|
||||
postReportToStream(out);
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
private final String getKey() {
|
||||
return getID() + ".report.xml.gz";
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -288,16 +244,21 @@ public class GATKRunReport {
|
|||
* That is, postReport() is guaranteed not to fail for any reason.
|
||||
*/
|
||||
private File postReportToLocalDisk(File rootDir) {
|
||||
String filename = getID() + ".report.xml.gz";
|
||||
File file = new File(rootDir, filename);
|
||||
final String filename = getKey();
|
||||
final File destination = new File(rootDir, filename);
|
||||
|
||||
try {
|
||||
postReportToFile(file);
|
||||
logger.debug("Wrote report to " + file);
|
||||
return file;
|
||||
final BufferedOutputStream out = new BufferedOutputStream(
|
||||
new GZIPOutputStream(
|
||||
new FileOutputStream(destination)));
|
||||
postReportToStream(out);
|
||||
out.close();
|
||||
logger.debug("Wrote report to " + destination);
|
||||
return destination;
|
||||
} catch ( Exception e ) {
|
||||
// we catch everything, and no matter what eat the error
|
||||
exceptDuringRunReport("Couldn't read report file", e);
|
||||
file.delete();
|
||||
destination.delete();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -305,42 +266,46 @@ public class GATKRunReport {
|
|||
private void postReportToAWSS3() {
|
||||
// modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html
|
||||
this.hostName = Utils.resolveHostname(); // we want to fill in the host name
|
||||
File localFile = postReportToLocalDisk(new File("./"));
|
||||
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
|
||||
if ( localFile != null ) { // we succeeded in creating the local file
|
||||
localFile.deleteOnExit();
|
||||
try {
|
||||
// stop us from printing the annoying, and meaningless, mime types warning
|
||||
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
|
||||
mimeTypeLogger.setLevel(Level.FATAL);
|
||||
final String key = getKey();
|
||||
logger.debug("Generating GATK report to AWS S3 with key " + key);
|
||||
try {
|
||||
// create a byte output stream so we can capture the output as a byte[]
|
||||
final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096);
|
||||
final OutputStream outputStream = new GZIPOutputStream(byteStream);
|
||||
postReportToStream(outputStream);
|
||||
outputStream.close();
|
||||
final byte[] report = byteStream.toByteArray();
|
||||
|
||||
// Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
|
||||
// are stored in an AWSCredentials object:
|
||||
// stop us from printing the annoying, and meaningless, mime types warning
|
||||
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
|
||||
mimeTypeLogger.setLevel(Level.FATAL);
|
||||
|
||||
// IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
|
||||
String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
|
||||
String awsSecretKey = "uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA"; // GATK AWS user
|
||||
AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
|
||||
// Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
|
||||
// are stored in an AWSCredentials object:
|
||||
|
||||
// To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
|
||||
// implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
|
||||
S3Service s3Service = new RestS3Service(awsCredentials);
|
||||
// IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
|
||||
String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
|
||||
String awsSecretKey = "uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA"; // GATK AWS user
|
||||
AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
|
||||
|
||||
// Create an S3Object based on a file, with Content-Length set automatically and
|
||||
// Content-Type set based on the file's extension (using the Mimetypes utility class)
|
||||
S3Object fileObject = new S3Object(localFile);
|
||||
//logger.info("Created S3Object" + fileObject);
|
||||
//logger.info("Uploading " + localFile + " to AWS bucket");
|
||||
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
|
||||
logger.debug("Uploaded to AWS: " + s3Object);
|
||||
logger.info("Uploaded run statistics report to AWS S3");
|
||||
} catch ( S3ServiceException e ) {
|
||||
exceptDuringRunReport("S3 exception occurred", e);
|
||||
} catch ( NoSuchAlgorithmException e ) {
|
||||
exceptDuringRunReport("Couldn't calculate MD5", e);
|
||||
} catch ( IOException e ) {
|
||||
exceptDuringRunReport("Couldn't read report file", e);
|
||||
}
|
||||
// To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
|
||||
// implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
|
||||
S3Service s3Service = new RestS3Service(awsCredentials);
|
||||
|
||||
// Create an S3Object based on a file, with Content-Length set automatically and
|
||||
// Content-Type set based on the file's extension (using the Mimetypes utility class)
|
||||
S3Object fileObject = new S3Object(key, report);
|
||||
//logger.info("Created S3Object" + fileObject);
|
||||
//logger.info("Uploading " + localFile + " to AWS bucket");
|
||||
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
|
||||
logger.debug("Uploaded to AWS: " + s3Object);
|
||||
logger.info("Uploaded run statistics report to AWS S3");
|
||||
} catch ( S3ServiceException e ) {
|
||||
exceptDuringRunReport("S3 exception occurred", e);
|
||||
} catch ( NoSuchAlgorithmException e ) {
|
||||
exceptDuringRunReport("Couldn't calculate MD5", e);
|
||||
} catch ( IOException e ) {
|
||||
exceptDuringRunReport("Couldn't read report file", e);
|
||||
}
|
||||
}
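The reworked S3 path above serializes the gzipped report straight into memory and uploads the resulting byte[] under the same key used for local files, so no temporary file is written. A stripped-down sketch of that in-memory gzip step (class and method names here are invented for illustration):

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;
    import java.util.zip.GZIPOutputStream;

    public class ReportCompressor {
        // Compress an XML report into a gzip byte[] suitable for an in-memory upload.
        public static byte[] gzip(String xml) throws IOException {
            ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096);
            OutputStream out = new GZIPOutputStream(byteStream);
            out.write(xml.getBytes("UTF-8"));
            out.close();                                   // closing finishes the gzip stream
            return byteStream.toByteArray();
        }
    }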
|
||||
|
||||
|
|
|
|||
|
|
@ -296,6 +296,10 @@ public class GATKReportTable {
|
|||
return primaryKeyColumn.contains(primaryKey);
|
||||
}
|
||||
|
||||
public Collection<Object> getPrimaryKeys() {
|
||||
return Collections.unmodifiableCollection(primaryKeyColumn);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the value for a given position in the table
|
||||
*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,258 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 12/9/11
|
||||
*/
|
||||
|
||||
public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
|
||||
/**
|
||||
* our log, which captures everything from this class
|
||||
*/
|
||||
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
private final Queue<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
|
||||
private final LinkedHashSet<GATKSAMRecord> myReads = new LinkedHashSet<GATKSAMRecord>();
|
||||
|
||||
@Override
|
||||
protected String getTraversalType() {
|
||||
return "active regions";
|
||||
}
|
||||
|
||||
@Override
|
||||
public T traverse( final ActiveRegionWalker<M,T> walker,
|
||||
final LocusShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
|
||||
|
||||
final LocusView locusView = getLocusView( walker, dataProvider );
|
||||
final GenomeLocSortedSet initialIntervals = engine.getIntervals(); // BUGBUG: unfortunate inefficiency that needs to be removed
|
||||
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
|
||||
int minStart = Integer.MAX_VALUE;
|
||||
final ArrayList<Double> isActiveList = new ArrayList<Double>();
|
||||
GenomeLoc firstIsActiveStart = null;
|
||||
|
||||
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = null;
|
||||
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
||||
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
|
||||
else
|
||||
referenceOrderedDataView = (RodLocusView)locusView;
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
GenomeLoc prevLoc = null;
|
||||
while( locusView.hasNext() ) {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
GenomeLoc location = locus.getLocation();
|
||||
if(prevLoc != null) {
|
||||
for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) {
|
||||
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
|
||||
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
|
||||
final double isActiveProb = ( walker.presetActiveRegions == null ? 0.0 : ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) );
|
||||
isActiveList.add( isActiveProb );
|
||||
if( firstIsActiveStart == null ) {
|
||||
firstIsActiveStart = fakeLoc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
|
||||
final double isActiveProb = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus )
|
||||
: ( walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0 ) );
|
||||
isActiveList.add( isActiveProb );
|
||||
if( firstIsActiveStart == null ) {
|
||||
firstIsActiveStart = location;
|
||||
}
|
||||
}
|
||||
|
||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
||||
// which active regions in the work queue are now safe to process
|
||||
if( !locusView.hasNext() ) {
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); }
|
||||
}
|
||||
}
|
||||
prevLoc = location;
|
||||
printProgress(dataProvider.getShard(), locus.getLocation());
|
||||
}
|
||||
|
||||
// Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
// add these blocks of work to the work queue
|
||||
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null );
|
||||
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
if( walker.activeRegionOutStream == null ) {
|
||||
workQueue.addAll( activeRegions );
|
||||
} else { // Just want to output the active regions to a file, not actually process them
|
||||
for( final ActiveRegion activeRegion : activeRegions ) {
|
||||
if( activeRegion.isActive ) {
|
||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
||||
while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) {
|
||||
final ActiveRegion activeRegion = workQueue.remove();
|
||||
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Special function called in LinearMicroScheduler to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||
public T endTraversal( final Walker<M,T> walker, T sum) {
|
||||
while( workQueue.peek() != null ) {
|
||||
final ActiveRegion activeRegion = workQueue.remove();
|
||||
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker<M,T>) walker );
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet<GATKSAMRecord> reads, final Queue<ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
|
||||
final ArrayList<GATKSAMRecord> placedReads = new ArrayList<GATKSAMRecord>();
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
|
||||
if( activeRegion.getLocation().overlapsP( readLoc ) ) {
|
||||
// The region with the highest amount of overlap is chosen as the primary region for the read (ties are broken toward the right-most region)
|
||||
long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
|
||||
ActiveRegion bestRegion = activeRegion;
|
||||
for( final ActiveRegion otherRegionToTest : workQueue ) {
|
||||
if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
|
||||
maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
|
||||
bestRegion = otherRegionToTest;
|
||||
}
|
||||
}
|
||||
bestRegion.add( read );
|
||||
|
||||
// The read is also added to all other regions that it overlaps, but marked as non-primary
|
||||
if( walker.wantsNonPrimaryReads() ) {
|
||||
if( !bestRegion.equals(activeRegion) ) {
|
||||
activeRegion.add( read );
|
||||
}
|
||||
for( final ActiveRegion otherRegionToTest : workQueue ) {
|
||||
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
|
||||
otherRegionToTest.add( read );
|
||||
}
|
||||
}
|
||||
}
|
||||
placedReads.add( read );
|
||||
} else if( activeRegion.getExtendedLoc().overlapsP( readLoc ) && walker.wantsNonPrimaryReads() ) {
|
||||
activeRegion.add( read );
|
||||
}
|
||||
}
|
||||
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
|
||||
|
||||
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
|
||||
final M x = walker.map( activeRegion, null );
|
||||
return walker.reduce( x, sum );
|
||||
}
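processActiveRegion() assigns each read to the overlapping region with the greatest overlap, with ties going to the later region in the queue; non-primary overlaps are only added when the walker asks for them. A simplified version of the primary-region rule over plain [start, end] intervals (illustrative only; it assumes the candidate list is in genomic order, as the work queue is):

    import java.util.List;

    public class PrimaryRegionPicker {
        // Returns the index of the region with maximal overlap (ties -> right-most), or -1 if none overlap.
        public static int pickPrimary(int readStart, int readEnd, List<int[]> regions) {
            long bestOverlap = 0;
            int bestIndex = -1;
            for (int i = 0; i < regions.size(); i++) {
                int[] region = regions.get(i);           // {start, endInclusive}
                long overlap = Math.min(readEnd, region[1]) - Math.max(readStart, region[0]) + 1;
                if (overlap > 0 && overlap >= bestOverlap) {   // ">=" keeps the later region on ties
                    bestOverlap = overlap;
                    bestIndex = i;
                }
            }
            return bestIndex;
        }
    }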
|
||||
|
||||
/**
|
||||
* Gets the best view of loci for this walker given the available data.
|
||||
* @param walker walker to interrogate.
|
||||
* @param dataProvider Data with which to drive the locus view.
|
||||
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
||||
*/
|
||||
private LocusView getLocusView( final Walker<M,T> walker, final LocusShardDataProvider dataProvider ) {
|
||||
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
|
||||
if( dataSource == DataSource.READS )
|
||||
return new CoveredLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
|
||||
return new AllLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
|
||||
return new RodLocusView(dataProvider);
|
||||
else
|
||||
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
||||
}
|
||||
|
||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<Double> activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) {
|
||||
|
||||
final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author
|
||||
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
|
||||
if( activeList.size() == 0 ) {
|
||||
return returnList;
|
||||
} else if( activeList.size() == 1 ) {
|
||||
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()),
|
||||
activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) );
|
||||
return returnList;
|
||||
} else {
|
||||
final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]);
|
||||
final double[] filteredProbArray = new double[activeProbArray.length];
|
||||
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author
|
||||
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // BUGBUG: needs to be set-able by the walker author
|
||||
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
|
||||
double maxVal = 0;
|
||||
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) {
|
||||
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
|
||||
}
|
||||
filteredProbArray[iii] = maxVal;
|
||||
}
|
||||
|
||||
boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD;
|
||||
int curStart = 0;
|
||||
for(int iii = 1; iii < filteredProbArray.length; iii++ ) {
|
||||
final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD;
|
||||
if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
|
||||
returnList.add( new ActiveRegion(
|
||||
engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)),
|
||||
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
|
||||
curStatus = thisStatus;
|
||||
curStart = iii;
|
||||
}
|
||||
}
|
||||
if( curStart != filteredProbArray.length-1 ) {
|
||||
returnList.add( new ActiveRegion(
|
||||
engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)),
|
||||
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
|
||||
}
|
||||
return returnList;
|
||||
}
|
||||
}
|
||||
}
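integrateActiveList() above does two things: a windowed maximum over the per-locus probabilities (FILTER_SIZE loci to either side), then thresholding of the smoothed values into runs, cutting any run that exceeds MAX_ACTIVE_REGION. A self-contained sketch of that smoothing-plus-segmentation step, stripped of the GenomeLoc bookkeeping (it also always emits the trailing run, which the engine code guards slightly differently):

    import java.util.ArrayList;
    import java.util.List;

    public class ActivitySegmenter {
        // Returns runs as {startIndex, endIndexInclusive, isActive ? 1 : 0}.
        public static List<int[]> segment(double[] probs, int filterSize, double threshold, int maxRegion) {
            List<int[]> runs = new ArrayList<int[]>();
            if (probs.length == 0) return runs;

            double[] smoothed = new double[probs.length];
            for (int i = 0; i < probs.length; i++) {
                double max = 0.0;
                for (int j = Math.max(0, i - filterSize); j < Math.min(probs.length, i + filterSize + 1); j++)
                    max = Math.max(max, probs[j]);
                smoothed[i] = max;                          // windowed maximum ("band pass") filter
            }

            boolean currentStatus = smoothed[0] > threshold;
            int runStart = 0;
            for (int i = 1; i < smoothed.length; i++) {
                boolean status = smoothed[i] > threshold;
                if (status != currentStatus || (i - runStart) > maxRegion) {
                    runs.add(new int[]{runStart, i - 1, currentStatus ? 1 : 0});
                    currentStatus = status;
                    runStart = i;
                }
            }
            runs.add(new int[]{runStart, smoothed.length - 1, currentStatus ? 1 : 0});
            return runs;
        }
    }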
|
||||
|
|
@ -102,7 +102,9 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
|
|||
}
|
||||
|
||||
/**
|
||||
* Gets the best view of loci for this walker given the available data.
|
||||
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
|
||||
* of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
|
||||
* that comes along.
|
||||
* @param walker walker to interrogate.
|
||||
* @param dataProvider Data with which to drive the locus view.
|
||||
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,19 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import java.lang.annotation.Documented;
|
||||
import java.lang.annotation.Inherited;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
|
||||
/**
|
||||
* Describes the size of the buffer region that is added to each active region when pulling in covered reads.
|
||||
* User: rpoplin
|
||||
* Date: 1/18/12
|
||||
*/
|
||||
@Documented
|
||||
@Inherited
|
||||
@Retention(RetentionPolicy.RUNTIME)
|
||||
|
||||
public @interface ActiveRegionExtension {
|
||||
public int extension() default 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Base class for all the Active Region Walkers.
|
||||
* User: rpoplin
|
||||
* Date: 12/7/11
|
||||
*/
|
||||
|
||||
@By(DataSource.READS)
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
|
||||
@PartitionBy(PartitionType.READ)
|
||||
@ActiveRegionExtension(extension=50)
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
|
||||
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
||||
|
||||
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
|
||||
public PrintStream activeRegionOutStream = null;
|
||||
|
||||
@Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)
|
||||
protected List<IntervalBinding<Feature>> activeRegionBindings = null;
|
||||
|
||||
public GenomeLocSortedSet presetActiveRegions = null;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
if( activeRegionBindings == null ) { return; }
|
||||
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
|
||||
for ( IntervalBinding intervalBinding : activeRegionBindings ) {
|
||||
List<GenomeLoc> intervals = intervalBinding.getIntervals(this.getToolkit());
|
||||
|
||||
if ( intervals.isEmpty() ) {
|
||||
logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
|
||||
}
|
||||
|
||||
allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION);
|
||||
}
|
||||
|
||||
presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL);
|
||||
}
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
return true; // We are keeping all the reads
|
||||
}
|
||||
|
||||
public boolean wantsNonPrimaryReads() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Determine probability of active status over the AlignmentContext
|
||||
public abstract double isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
|
||||
|
||||
// Map over the ActiveRegion
|
||||
public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
|
||||
|
||||
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
|
||||
final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||
final List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
|
||||
for( final GenomeLoc interval : intervals.toList() ) {
|
||||
final int start = Math.max( 1, interval.getStart() - activeRegionExtension );
|
||||
final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension );
|
||||
allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) );
|
||||
}
|
||||
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL);
|
||||
}
|
||||
}
|
||||
|
|
@ -30,18 +30,19 @@ import net.sf.samtools.SAMReadGroupRecord;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
|
||||
*
|
||||
|
|
@ -70,12 +71,21 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
* -I input2.bam \
|
||||
* --read_filter MappingQualityZero
|
||||
*
|
||||
* // Prints the first 2000 reads in the BAM file
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T PrintReads \
|
||||
* -o output.bam \
|
||||
* -I input.bam \
|
||||
* -n 2000
|
||||
*
|
||||
* // Downsamples BAM file to 25%
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T PrintReads \
|
||||
* -o output.bam \
|
||||
* -I input.bam \
|
||||
* -ds 0.25
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
|
@@ -95,9 +105,18 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
|||
@Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
|
||||
String platform = null;
|
||||
|
||||
/**
|
||||
* Only prints the first n reads of the file
|
||||
*/
|
||||
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
|
||||
int nReadsToPrint = -1;
|
||||
|
||||
/**
|
||||
* Downsamples the bam file by the given ratio, printing only approximately the given percentage of reads. The downsampling is balanced (over the entire coverage)
|
||||
*/
|
||||
@Argument(fullName = "downsample_coverage", shortName = "ds", doc="Downsample BAM to desired coverage", required = false)
|
||||
public double downsampleRatio = 1.0;
|
||||
|
||||
/**
|
||||
* Only reads from samples listed in the provided file(s) will be included in the output.
|
||||
*/
|
||||
|
|
@ -112,6 +131,8 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
|||
|
||||
private TreeSet<String> samplesToChoose = new TreeSet<String>();
|
||||
private boolean SAMPLES_SPECIFIED = false;
|
||||
|
||||
Random random;
|
||||
|
||||
/**
|
||||
* The initialize function.
|
||||
|
|
@ -132,13 +153,15 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
|||
if(!samplesToChoose.isEmpty()) {
|
||||
SAMPLES_SPECIFIED = true;
|
||||
}
|
||||
|
||||
random = GenomeAnalysisEngine.getRandomGenerator();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* The reads filter function.
|
||||
*
|
||||
* @param ref the reference bases that correspond to our read, if a reference was provided
|
||||
* @param ref the reference bases that correspond to our read, if a reference was provided
|
||||
* @param read the read itself, as a SAMRecord
|
||||
* @return true if the read passes the filter, false if it doesn't
|
||||
*/
|
||||
|
|
@ -177,13 +200,14 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
|||
nReadsToPrint--; // n > 0 means there are still reads to be printed.
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
// if downsample option is turned off (= 1) then don't waste time getting the next random number.
|
||||
return (downsampleRatio == 1 || random.nextDouble() < downsampleRatio);
|
||||
}
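The downsampling branch above is just an independent Bernoulli trial per read. A minimal standalone sketch of the same idea, assuming nothing beyond the filter logic shown in this diff (the class and method names below are illustrative and not part of the GATK codebase):

import java.util.Random;

// Illustrative sketch (not GATK API): keep each read independently with
// probability downsampleRatio; a ratio of 1.0 keeps everything and skips
// the random draw, mirroring the filter() logic above.
class ReadDownsampler {
    private final Random random;
    private final double downsampleRatio;

    ReadDownsampler(final double downsampleRatio, final long seed) {
        this.downsampleRatio = downsampleRatio;
        this.random = new Random(seed);
    }

    boolean keepRead() {
        return downsampleRatio == 1.0 || random.nextDouble() < downsampleRatio;
    }
}

On average a ratio of 0.25 keeps roughly a quarter of the reads, which matches the `-ds 0.25` usage example in the tool documentation above.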
|
||||
|
||||
/**
|
||||
* The reads map function.
|
||||
*
|
||||
* @param ref the reference bases that correspond to our read, if a reference was provided
|
||||
* @param ref the reference bases that correspond to our read, if a reference was provided
|
||||
* @param read the read itself, as a SAMRecord
|
||||
* @return the read itself
|
||||
*/
|
||||
|
|
@ -194,6 +218,7 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
|||
/**
|
||||
* reduceInit is called once before any calls to the map function. We use it here to set up the output
* BAM file, if it was specified on the command line
|
||||
*
|
||||
* @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
|
||||
*/
|
||||
public SAMFileWriter reduceInit() {
|
||||
|
|
@ -202,7 +227,8 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
|||
|
||||
/**
|
||||
* given a read and an output location, reduce by emitting the read
|
||||
* @param read the read itself
|
||||
*
|
||||
* @param read the read itself
|
||||
* @param output the output source
|
||||
* @return the SAMFileWriter, so that the next reduce can emit to the same source
|
||||
*/
|
||||
|
|
|
|||
|
|
@@ -22,16 +22,16 @@ public class BaseQualityRankSumTest extends RankSumTest {
|
|||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); }
|
||||
|
||||
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
|
||||
protected void fillQualsFromPileup(byte ref, List<Byte> alts, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if( isUsableBase(p) ) {
|
||||
if ( p.getBase() == ref ) {
|
||||
if ( p.getBase() == ref )
|
||||
refQuals.add((double)p.getQual());
|
||||
} else if ( p.getBase() == alt ) {
|
||||
else if ( alts.contains(p.getBase()) )
|
||||
altQuals.add((double)p.getQual());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
|
||||
// the equivalent question is whether indel likelihoods for reads corresponding to the ref allele are more likely than for reads corresponding to the alt allele
|
||||
|
|
@ -57,8 +57,6 @@ public class BaseQualityRankSumTest extends RankSumTest {
|
|||
refQuals.add(-10.0*refLikelihood);
|
||||
else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH)
|
||||
altQuals.add(-10.0*altLikelihood);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -54,15 +54,15 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
private static final double MIN_PVALUE = 1E-320;
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( ! vc.isVariant() || vc.isFiltered() )
|
||||
if ( !vc.isVariant() )
|
||||
return null;
|
||||
|
||||
int[][] table;
|
||||
|
||||
if (vc.isBiallelic() && vc.isSNP())
|
||||
table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAllele(0));
|
||||
else if (vc.isIndel() || vc.isMixed()) {
|
||||
table = getIndelContingencyTable(stratifiedContexts, vc);
|
||||
if ( vc.isSNP() )
|
||||
table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
|
||||
else if ( vc.isIndel() || vc.isMixed() ) {
|
||||
table = getIndelContingencyTable(stratifiedContexts);
|
||||
if (table == null)
|
||||
return null;
|
||||
}
|
||||
|
|
@ -73,7 +73,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
if ( pvalue == null )
|
||||
return null;
|
||||
|
||||
// use Math.abs to prevent -0's
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue)));
|
||||
return map;
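As a concrete reading of the FS value emitted above: the p-value comes from an exact test on the 2x2 strand-by-allele contingency table built further down, and phredScaleErrorRate reports it on the phred scale, i.e. FS = -10 * log10(p) under the usual phred convention. For example, p = 0.001 gives FS = 30, so larger FS values indicate stronger evidence of strand bias.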
|
||||
|
|
@ -206,7 +205,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
for (PileupElement p : sample.getValue().getBasePileup()) {
|
||||
if ( p.isDeletion() || p.isReducedRead() ) // ignore deletions and reduced reads
|
||||
if ( p.isDeletion() || p.getRead().isReducedRead() ) // ignore deletions and reduced reads
|
||||
continue;
|
||||
|
||||
if ( p.getRead().getMappingQuality() < 20 || p.getQual() < 20 )
|
||||
|
|
@ -235,7 +234,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
* allele2 # #
|
||||
* @return a 2x2 contingency table
|
||||
*/
|
||||
private static int[][] getIndelContingencyTable(Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
private static int[][] getIndelContingencyTable(Map<String, AlignmentContext> stratifiedContexts) {
|
||||
final double INDEL_LIKELIHOOD_THRESH = 0.3;
|
||||
final HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
|
||||
|
||||
|
|
@ -259,7 +258,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
continue;
|
||||
|
||||
for (final PileupElement p: pileup) {
|
||||
if ( p.isReducedRead() ) // ignore reduced reads
|
||||
if ( p.getRead().isReducedRead() ) // ignore reduced reads
|
||||
continue;
|
||||
if ( p.getRead().getMappingQuality() < 20)
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -24,7 +24,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -43,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
|
@ -52,8 +52,7 @@ import java.util.*;
|
|||
/**
|
||||
* Consistency of the site with two (and only two) segregating haplotypes. Higher scores
|
||||
* are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls.
|
||||
* Note that the Haplotype Score is only calculated for sites with read coverage; also, for SNPs, the
|
||||
* site must be bi-allelic.
|
||||
* Note that the Haplotype Score is only calculated for sites with read coverage.
|
||||
*/
|
||||
public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
private final static boolean DEBUG = false;
|
||||
|
|
@ -62,15 +61,12 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
private final static char REGEXP_WILDCARD = '.';
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if (stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here
|
||||
return null;
|
||||
|
||||
if (vc.isSNP() && !vc.isBiallelic())
|
||||
if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here
|
||||
return null;
|
||||
|
||||
final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values());
|
||||
|
||||
final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE);
|
||||
final int contextWingSize = Math.min((ref.getWindow().size() - 1) / 2, MIN_CONTEXT_WING_SIZE);
|
||||
final int contextSize = contextWingSize * 2 + 1;
|
||||
|
||||
final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2;
|
||||
|
|
@ -84,14 +80,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
|
||||
if (pileup == null)
|
||||
return null;
|
||||
|
||||
|
||||
final List<Haplotype> haplotypes = computeHaplotypes(pileup, contextSize, locus, vc);
|
||||
|
||||
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
|
||||
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
|
||||
if (haplotypes != null) {
|
||||
for ( final Genotype genotype : vc.getGenotypes()) {
|
||||
for (final Genotype genotype : vc.getGenotypes()) {
|
||||
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( thisContext != null ) {
|
||||
if (thisContext != null) {
|
||||
final ReadBackedPileup thisPileup;
|
||||
if (thisContext.hasExtendedEventPileup())
|
||||
thisPileup = thisContext.getExtendedEventPileup();
|
||||
|
|
@ -102,14 +98,13 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
|
||||
if (thisPileup != null) {
|
||||
if (vc.isSNP())
|
||||
scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
|
||||
scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
|
||||
else if (vc.isIndel() || vc.isMixed()) {
|
||||
Double d = scoreIndelsAgainstHaplotypes(thisPileup);
|
||||
if (d == null)
|
||||
return null;
|
||||
scoreRA.add( d ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
|
||||
}
|
||||
else
|
||||
scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -122,12 +117,12 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
return map;
|
||||
}
|
||||
|
||||
private class HaplotypeComparator implements Comparator<Haplotype>{
|
||||
private class HaplotypeComparator implements Comparator<Haplotype> {
|
||||
|
||||
public int compare(Haplotype a, Haplotype b) {
|
||||
if (a.getQualitySum() < b.getQualitySum())
|
||||
return 1;
|
||||
if (a.getQualitySum() > b.getQualitySum()){
|
||||
if (a.getQualitySum() > b.getQualitySum()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
|
|
@ -137,39 +132,38 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
private List<Haplotype> computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus, final VariantContext vc) {
|
||||
// Compute all possible haplotypes consistent with current pileup
|
||||
|
||||
int haplotypesToCompute = vc.getAlternateAlleles().size()+1;
|
||||
int haplotypesToCompute = vc.getAlternateAlleles().size() + 1;
|
||||
|
||||
final PriorityQueue<Haplotype> candidateHaplotypeQueue = new PriorityQueue<Haplotype>(100, new HaplotypeComparator());
|
||||
final PriorityQueue<Haplotype> consensusHaplotypeQueue = new PriorityQueue<Haplotype>(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator());
|
||||
|
||||
for ( final PileupElement p : pileup ) {
|
||||
for (final PileupElement p : pileup) {
|
||||
final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus);
|
||||
candidateHaplotypeQueue.add(haplotypeFromRead);
|
||||
}
|
||||
|
||||
// Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes
|
||||
Haplotype elem;
|
||||
while ((elem = candidateHaplotypeQueue.poll()) != null) {
|
||||
while ((elem = candidateHaplotypeQueue.poll()) != null) {
|
||||
boolean foundHaplotypeMatch = false;
|
||||
Haplotype lastCheckedHaplotype = null;
|
||||
for ( final Haplotype haplotypeFromList : consensusHaplotypeQueue ) {
|
||||
for (final Haplotype haplotypeFromList : consensusHaplotypeQueue) {
|
||||
final Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList);
|
||||
if (consensusHaplotype != null) {
|
||||
if (consensusHaplotype != null) {
|
||||
foundHaplotypeMatch = true;
|
||||
if (consensusHaplotype.getQualitySum() > haplotypeFromList.getQualitySum()) {
|
||||
consensusHaplotypeQueue.remove(haplotypeFromList);
|
||||
consensusHaplotypeQueue.add(consensusHaplotype);
|
||||
}
|
||||
break;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
lastCheckedHaplotype = haplotypeFromList;
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundHaplotypeMatch && consensusHaplotypeQueue.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) {
|
||||
consensusHaplotypeQueue.add(elem);
|
||||
} else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum() ) {
|
||||
} else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum()) {
|
||||
consensusHaplotypeQueue.remove(lastCheckedHaplotype);
|
||||
consensusHaplotypeQueue.add(elem);
|
||||
}
|
||||
|
|
@ -180,12 +174,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
// The consensus haplotypes are in a quality-ordered priority queue, so the best haplotypes are just the ones at the front of the queue
|
||||
final Haplotype haplotype1 = consensusHaplotypeQueue.poll();
|
||||
|
||||
List<Haplotype>hlist = new ArrayList<Haplotype>();
|
||||
List<Haplotype> hlist = new ArrayList<Haplotype>();
|
||||
hlist.add(new Haplotype(haplotype1.getBases(), 60));
|
||||
|
||||
for (int k=1; k < haplotypesToCompute; k++) {
|
||||
for (int k = 1; k < haplotypesToCompute; k++) {
|
||||
Haplotype haplotype2 = consensusHaplotypeQueue.poll();
|
||||
if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found
|
||||
if (haplotype2 == null) {
|
||||
haplotype2 = haplotype1;
|
||||
} // Sometimes only the reference haplotype can be found
|
||||
hlist.add(new Haplotype(haplotype2.getBases(), 20));
|
||||
}
|
||||
return hlist;
|
||||
|
|
@ -194,36 +190,43 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
}
|
||||
|
||||
private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) {
|
||||
final SAMRecord read = p.getRead();
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
int readOffsetFromPileup = p.getOffset();
|
||||
|
||||
final byte[] haplotypeBases = new byte[contextSize];
|
||||
Arrays.fill(haplotypeBases, (byte)REGEXP_WILDCARD);
|
||||
Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
|
||||
final double[] baseQualities = new double[contextSize];
|
||||
Arrays.fill(baseQualities, 0.0);
|
||||
|
||||
byte[] readBases = read.getReadBases();
|
||||
readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
|
||||
readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases); // Adjust the read bases based on the Cigar string
|
||||
byte[] readQuals = read.getBaseQualities();
|
||||
readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string
|
||||
readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string
|
||||
|
||||
readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus);
|
||||
final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2;
|
||||
readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus);
|
||||
final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;
|
||||
|
||||
for (int i = 0; i < contextSize; i++ ) {
|
||||
for (int i = 0; i < contextSize; i++) {
|
||||
final int baseOffset = i + baseOffsetStart;
|
||||
if ( baseOffset < 0 ) {
|
||||
if (baseOffset < 0) {
|
||||
continue;
|
||||
}
|
||||
if ( baseOffset >= readBases.length ) {
|
||||
if (baseOffset >= readBases.length) {
|
||||
break;
|
||||
}
|
||||
if( readQuals[baseOffset] == PileupElement.DELETION_BASE) { readQuals[baseOffset] = PileupElement.DELETION_QUAL; }
|
||||
if( !BaseUtils.isRegularBase(readBases[baseOffset]) ) { readBases[baseOffset] = (byte)REGEXP_WILDCARD; readQuals[baseOffset] = (byte) 0; } // N's shouldn't be treated as distinct bases
|
||||
readQuals[baseOffset] = (byte)Math.min((int)readQuals[baseOffset], p.getMappingQual());
|
||||
if( ((int)readQuals[baseOffset]) < 5 ) { readQuals[baseOffset] = (byte) 0; } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
|
||||
if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
|
||||
readQuals[baseOffset] = PileupElement.DELETION_QUAL;
|
||||
}
|
||||
if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
|
||||
readBases[baseOffset] = (byte) REGEXP_WILDCARD;
|
||||
readQuals[baseOffset] = (byte) 0;
|
||||
} // N's shouldn't be treated as distinct bases
|
||||
readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
|
||||
if (((int) readQuals[baseOffset]) < 5) {
|
||||
readQuals[baseOffset] = (byte) 0;
|
||||
} // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
|
||||
haplotypeBases[i] = readBases[baseOffset];
|
||||
baseQualities[i] = (double)readQuals[baseOffset];
|
||||
baseQualities[i] = (double) readQuals[baseOffset];
|
||||
}
|
||||
|
||||
return new Haplotype(haplotypeBases, baseQualities);
|
||||
|
|
@ -238,7 +241,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
}
|
||||
|
||||
byte chA, chB;
|
||||
final byte wc = (byte)REGEXP_WILDCARD;
|
||||
final byte wc = (byte) REGEXP_WILDCARD;
|
||||
|
||||
final int length = a.length;
|
||||
final byte[] consensusChars = new byte[length];
|
||||
|
|
@ -247,7 +250,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
final double[] qualsA = haplotypeA.getQuals();
|
||||
final double[] qualsB = haplotypeB.getQuals();
|
||||
|
||||
for (int i=0; i < length; i++) {
|
||||
for (int i = 0; i < length; i++) {
|
||||
chA = a[i];
|
||||
chB = b[i];
|
||||
|
||||
|
|
@ -257,17 +260,15 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
if ((chA == wc) && (chB == wc)) {
|
||||
consensusChars[i] = wc;
|
||||
consensusQuals[i] = 0.0;
|
||||
}
|
||||
else if ((chA == wc)) {
|
||||
} else if ((chA == wc)) {
|
||||
consensusChars[i] = chB;
|
||||
consensusQuals[i] = qualsB[i];
|
||||
}
|
||||
else if ((chB == wc)){
|
||||
} else if ((chB == wc)) {
|
||||
consensusChars[i] = chA;
|
||||
consensusQuals[i] = qualsA[i];
|
||||
} else {
|
||||
consensusChars[i] = chA;
|
||||
consensusQuals[i] = qualsA[i]+qualsB[i];
|
||||
consensusQuals[i] = qualsA[i] + qualsB[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -276,31 +277,33 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
|
||||
// calculate the haplotype scores by walking over all reads and comparing them to the haplotypes
|
||||
private double scoreReadsAgainstHaplotypes(final List<Haplotype> haplotypes, final ReadBackedPileup pileup, final int contextSize, final int locus) {
|
||||
if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0));
|
||||
if ( DEBUG ) System.out.printf("HAP2: %s%n", haplotypes.get(1));
|
||||
if (DEBUG) System.out.printf("HAP1: %s%n", haplotypes.get(0));
|
||||
if (DEBUG) System.out.printf("HAP2: %s%n", haplotypes.get(1));
|
||||
|
||||
final ArrayList<double[]> haplotypeScores = new ArrayList<double[]>();
|
||||
for ( final PileupElement p : pileup ) {
|
||||
for (final PileupElement p : pileup) {
|
||||
// Score all the reads in the pileup, even the filtered ones
|
||||
final double[] scores = new double[haplotypes.size()];
|
||||
for ( int i = 0; i < haplotypes.size(); i++ ) {
|
||||
for (int i = 0; i < haplotypes.size(); i++) {
|
||||
final Haplotype haplotype = haplotypes.get(i);
|
||||
final double score = scoreReadAgainstHaplotype(p, contextSize, haplotype, locus);
|
||||
scores[i] = score;
|
||||
if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i, score); }
|
||||
if (DEBUG) {
|
||||
System.out.printf(" vs. haplotype %d = %f%n", i, score);
|
||||
}
|
||||
}
|
||||
haplotypeScores.add(scores);
|
||||
}
|
||||
|
||||
double overallScore = 0.0;
|
||||
for ( final double[] readHaplotypeScores : haplotypeScores ) {
|
||||
for (final double[] readHaplotypeScores : haplotypeScores) {
|
||||
overallScore += MathUtils.arrayMin(readHaplotypeScores);
|
||||
}
|
||||
|
||||
return overallScore;
|
||||
}
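In other words, each pileup element contributes only its best (smallest) score across the candidate haplotypes, so the per-sample value is sum over reads of min over haplotypes of score(read, haplotype); the caller then averages these per-sample sums with a RunningAverage, per the comment above about using a simple average because the scores can be negative.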
|
||||
|
||||
private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus ) {
|
||||
private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
|
||||
double expected = 0.0;
|
||||
double mismatches = 0.0;
|
||||
|
||||
|
|
@ -315,33 +318,35 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
// the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch.
|
||||
// so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
|
||||
final byte[] haplotypeBases = haplotype.getBases();
|
||||
final SAMRecord read = p.getRead();
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
byte[] readBases = read.getReadBases();
|
||||
|
||||
readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
|
||||
byte[] readQuals = read.getBaseQualities();
|
||||
readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string
|
||||
int readOffsetFromPileup = p.getOffset();
|
||||
readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus);
|
||||
final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2;
|
||||
readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
|
||||
final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;
|
||||
|
||||
for ( int i = 0; i < contextSize; i++ ) {
|
||||
for (int i = 0; i < contextSize; i++) {
|
||||
final int baseOffset = i + baseOffsetStart;
|
||||
if ( baseOffset < 0 ) {
|
||||
if (baseOffset < 0) {
|
||||
continue;
|
||||
}
|
||||
if ( baseOffset >= readBases.length ) {
|
||||
if (baseOffset >= readBases.length) {
|
||||
break;
|
||||
}
|
||||
|
||||
final byte haplotypeBase = haplotypeBases[i];
|
||||
final byte readBase = readBases[baseOffset];
|
||||
|
||||
final boolean matched = ( readBase == haplotypeBase || haplotypeBase == (byte)REGEXP_WILDCARD );
|
||||
final boolean matched = (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
|
||||
byte qual = readQuals[baseOffset];
|
||||
if( qual == PileupElement.DELETION_BASE ) { qual = PileupElement.DELETION_QUAL; } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
|
||||
qual = (byte)Math.min((int)qual, p.getMappingQual());
|
||||
if( ((int) qual) >= 5 ) { // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
|
||||
if (qual == PileupElement.DELETION_BASE) {
|
||||
qual = PileupElement.DELETION_QUAL;
|
||||
} // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
|
||||
qual = (byte) Math.min((int) qual, p.getMappingQual());
|
||||
if (((int) qual) >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
|
||||
final double e = QualityUtils.qualToErrorProb(qual);
|
||||
expected += e;
|
||||
mismatches += matched ? e : 1.0 - e / 3.0;
|
||||
|
|
@ -355,26 +360,27 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
}
|
||||
|
||||
|
||||
|
||||
private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) {
|
||||
final ArrayList<double[]> haplotypeScores = new ArrayList<double[]>();
|
||||
|
||||
final HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
|
||||
final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
|
||||
|
||||
if (indelLikelihoodMap== null)
|
||||
if (indelLikelihoodMap == null)
|
||||
return null;
|
||||
|
||||
for (final PileupElement p: pileup) {
|
||||
for (final PileupElement p : pileup) {
|
||||
if (indelLikelihoodMap.containsKey(p)) {
|
||||
// retrieve likelihood information corresponding to this read
|
||||
LinkedHashMap<Allele,Double> el = indelLikelihoodMap.get(p);
|
||||
LinkedHashMap<Allele, Double> el = indelLikelihoodMap.get(p);
|
||||
|
||||
// Score all the reads in the pileup, even the filtered ones
|
||||
final double[] scores = new double[el.size()];
|
||||
int i = 0;
|
||||
for (Allele a: el.keySet() ) {
|
||||
for (Allele a : el.keySet()) {
|
||||
scores[i++] = -el.get(a);
|
||||
if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i-1, scores[i-1]); }
|
||||
if (DEBUG) {
|
||||
System.out.printf(" vs. haplotype %d = %f%n", i - 1, scores[i - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
haplotypeScores.add(scores);
|
||||
|
|
@ -383,7 +389,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
|
||||
// indel likelihoods are strict log-probs, not phred scored
|
||||
double overallScore = 0.0;
|
||||
for ( final double[] readHaplotypeScores : haplotypeScores ) {
|
||||
for (final double[] readHaplotypeScores : haplotypeScores) {
|
||||
overallScore += MathUtils.arrayMin(readHaplotypeScores);
|
||||
}
|
||||
|
||||
|
|
@ -392,6 +398,11 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
}
|
||||
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("HaplotypeScore"); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); }
|
||||
public List<String> getKeyNames() {
|
||||
return Arrays.asList("HaplotypeScore");
|
||||
}
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
|
||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -21,9 +22,9 @@ import java.util.*;
|
|||
* User: chartl
|
||||
* Date: 9/14/11
|
||||
* Time: 12:24 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
|
||||
|
||||
private MendelianViolation mendelianViolation = null;
|
||||
private String motherId;
|
||||
|
|
|
|||
|
|
@ -24,12 +24,12 @@ public class MappingQualityRankSumTest extends RankSumTest {
|
|||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); }
|
||||
|
||||
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
|
||||
protected void fillQualsFromPileup(byte ref, List<Byte> alts, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) ) {
|
||||
if ( p.getBase() == ref ) {
|
||||
refQuals.add((double)p.getMappingQual());
|
||||
} else if ( p.getBase() == alt ) {
|
||||
} else if ( alts.contains(p.getBase()) ) {
|
||||
altQuals.add((double)p.getMappingQual());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
|
@@ -30,32 +31,34 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
|||
static final boolean DEBUG = false;
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( stratifiedContexts.size() == 0 )
|
||||
if (stratifiedContexts.size() == 0)
|
||||
return null;
|
||||
|
||||
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
if (genotypes == null || genotypes.size() == 0)
|
||||
return null;
|
||||
|
||||
|
||||
final ArrayList<Double> refQuals = new ArrayList<Double>();
|
||||
final ArrayList<Double> altQuals = new ArrayList<Double>();
|
||||
|
||||
if (vc.isSNP() && vc.isBiallelic()) {
|
||||
// todo - no current support for multiallelic snps
|
||||
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null ) {
|
||||
continue;
|
||||
}
|
||||
fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals);
|
||||
}
|
||||
}
|
||||
else if (vc.isIndel() || vc.isMixed()) {
|
||||
if ( vc.isSNP() ) {
|
||||
final List<Byte> altAlleles = new ArrayList<Byte>();
|
||||
for ( final Allele a : vc.getAlternateAlleles() )
|
||||
altAlleles.add(a.getBases()[0]);
|
||||
|
||||
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null ) {
|
||||
if ( context == null )
|
||||
continue;
|
||||
|
||||
fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals);
|
||||
}
|
||||
} else if ( vc.isIndel() || vc.isMixed() ) {
|
||||
|
||||
for (final Genotype genotype : genotypes.iterateInSampleNameOrder()) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if (context == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -74,46 +77,47 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
|||
|
||||
fillIndelQualsFromPileup(pileup, refQuals, altQuals);
|
||||
}
|
||||
}
|
||||
else
|
||||
} else
|
||||
return null;
|
||||
|
||||
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
|
||||
for ( final Double qual : altQuals ) {
|
||||
for (final Double qual : altQuals) {
|
||||
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
|
||||
}
|
||||
for ( final Double qual : refQuals ) {
|
||||
for (final Double qual : refQuals) {
|
||||
mannWhitneyU.add(qual, MannWhitneyU.USet.SET2);
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.format("%s, REF QUALS:",this.getClass().getName());
|
||||
for ( final Double qual : refQuals )
|
||||
System.out.format("%4.1f ",qual);
|
||||
System.out.format("%s, REF QUALS:", this.getClass().getName());
|
||||
for (final Double qual : refQuals)
|
||||
System.out.format("%4.1f ", qual);
|
||||
System.out.println();
|
||||
System.out.format("%s, ALT QUALS:",this.getClass().getName());
|
||||
for ( final Double qual : altQuals )
|
||||
System.out.format("%4.1f ",qual);
|
||||
System.out.format("%s, ALT QUALS:", this.getClass().getName());
|
||||
for (final Double qual : altQuals)
|
||||
System.out.format("%4.1f ", qual);
|
||||
System.out.println();
|
||||
|
||||
}
|
||||
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
|
||||
final Pair<Double,Double> testResults = mannWhitneyU.runOneSidedTest( MannWhitneyU.USet.SET1 );
|
||||
final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
|
||||
|
||||
final Map<String, Object> map = new HashMap<String, Object>();
|
||||
if ( ! Double.isNaN(testResults.first) )
|
||||
if (!Double.isNaN(testResults.first))
|
||||
map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
|
||||
return map;
|
||||
|
||||
}
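For orientation: altQuals feed SET1 and refQuals feed SET2, and the value written to the map is the first element of the one-sided test result, interpreted in the header lines above as a Wilcoxon rank-sum z-score. Under the textbook normal approximation (ignoring tie and continuity corrections, which this diff does not show), z = (U - n1*n2/2) / sqrt(n1*n2*(n1+n2+1)/12), where U is the Mann-Whitney statistic and n1, n2 are the numbers of alt and ref observations.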
|
||||
|
||||
protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
|
||||
protected abstract void fillQualsFromPileup(byte ref, List<Byte> alts, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
|
||||
|
||||
protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
|
||||
|
||||
protected static boolean isUsableBase( final PileupElement p ) {
|
||||
return !( p.isDeletion() ||
|
||||
p.getMappingQual() == 0 ||
|
||||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
((int)p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE ); // need the unBAQed quality score here
|
||||
protected static boolean isUsableBase(final PileupElement p) {
|
||||
return !(p.isInsertionAtBeginningOfRead() ||
|
||||
p.isDeletion() ||
|
||||
p.getMappingQual() == 0 ||
|
||||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
|
||||
}
|
||||
}
|
||||
|
|
@ -24,27 +24,31 @@ import java.util.List;
|
|||
*/
|
||||
public class ReadPosRankSumTest extends RankSumTest {
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("ReadPosRankSum"); }
|
||||
public List<String> getKeyNames() {
|
||||
return Arrays.asList("ReadPosRankSum");
|
||||
}
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"));
|
||||
}
|
||||
|
||||
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if( isUsableBase(p) ) {
|
||||
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0);
|
||||
protected void fillQualsFromPileup(byte ref, List<Byte> alts, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
|
||||
for (final PileupElement p : pileup) {
|
||||
if (isUsableBase(p)) {
|
||||
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);
|
||||
final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead());
|
||||
if( readPos > numAlignedBases / 2 ) {
|
||||
readPos = numAlignedBases - ( readPos + 1 );
|
||||
}
|
||||
if (readPos > numAlignedBases / 2)
|
||||
readPos = numAlignedBases - (readPos + 1);
|
||||
|
||||
if ( p.getBase() == ref ) {
|
||||
refQuals.add( (double)readPos );
|
||||
} else if ( p.getBase() == alt ) {
|
||||
altQuals.add( (double)readPos );
|
||||
}
|
||||
|
||||
if ( p.getBase() == ref )
|
||||
refQuals.add((double) readPos);
|
||||
else if ( alts.contains(p.getBase()) )
|
||||
altQuals.add((double) readPos);
|
||||
}
|
||||
}
|
||||
}
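A quick worked example of the position folding above: for a read with 100 aligned bases, an aligned offset of 80 is folded to 100 - (80 + 1) = 19, i.e. the distance to the nearer end of the read, while an offset of 30 is already in the first half and is left unchanged. The rank-sum test therefore compares distances from the read ends rather than raw offsets.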
|
||||
|
||||
protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
|
||||
// equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele
|
||||
// to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element.
|
||||
|
|
@ -52,18 +56,15 @@ public class ReadPosRankSumTest extends RankSumTest {
|
|||
// To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles.
|
||||
// If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref"
// otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element is "alt"
|
||||
final HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
|
||||
for (final PileupElement p: pileup) {
|
||||
final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
|
||||
for (final PileupElement p : pileup) {
|
||||
if (indelLikelihoodMap.containsKey(p)) {
|
||||
// retrieve likelihood information corresponding to this read
|
||||
LinkedHashMap<Allele,Double> el = indelLikelihoodMap.get(p);
|
||||
// by design, first element in LinkedHashMap was ref allele
|
||||
double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY;
|
||||
LinkedHashMap<Allele, Double> el = indelLikelihoodMap.get(p); // retrieve likelihood information corresponding to this read
|
||||
double refLikelihood = 0.0, altLikelihood = Double.NEGATIVE_INFINITY; // by design, first element in LinkedHashMap was ref allele
|
||||
|
||||
for (Allele a : el.keySet()) {
|
||||
|
||||
if (a.isReference())
|
||||
refLikelihood =el.get(a);
|
||||
refLikelihood = el.get(a);
|
||||
else {
|
||||
double like = el.get(a);
|
||||
if (like >= altLikelihood)
|
||||
|
|
@ -75,23 +76,22 @@ public class ReadPosRankSumTest extends RankSumTest {
|
|||
final int numAlignedBases = getNumAlignedBases(p.getRead());
|
||||
|
||||
int rp = readPos;
|
||||
if( readPos > numAlignedBases / 2 ) {
|
||||
readPos = numAlignedBases - ( readPos + 1 );
|
||||
if (readPos > numAlignedBases / 2) {
|
||||
readPos = numAlignedBases - (readPos + 1);
|
||||
}
|
||||
//if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases);
|
||||
//if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases);
|
||||
|
||||
|
||||
// if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads
|
||||
// where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping.
|
||||
// if (readPos < -1)
|
||||
// if (readPos < -1)
|
||||
// return;
|
||||
if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) {
|
||||
refQuals.add((double)readPos);
|
||||
if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) {
|
||||
refQuals.add((double) readPos);
|
||||
//if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos);
|
||||
}
|
||||
else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) {
|
||||
altQuals.add((double)readPos);
|
||||
//if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos);
|
||||
} else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) {
|
||||
altQuals.add((double) readPos);
|
||||
//if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -115,7 +115,7 @@ public class ReadPosRankSumTest extends RankSumTest {
|
|||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
for (int i=numStartClippedBases; i < unclippedReadBases.length; i++) {
|
||||
for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
|
||||
if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
|
||||
numStartClippedBases++;
|
||||
else
|
||||
|
|
@ -134,7 +134,7 @@ public class ReadPosRankSumTest extends RankSumTest {
|
|||
// compute total number of clipped bases (soft or hard clipped)
|
||||
// check for hard clips (never consider these bases):
|
||||
final Cigar c = read.getCigar();
|
||||
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
|
||||
CigarElement last = c.getCigarElement(c.numCigarElements() - 1);
|
||||
|
||||
int numEndClippedBases = 0;
|
||||
if (last.getOperator() == CigarOperator.H) {
|
||||
|
|
@ -145,7 +145,7 @@ public class ReadPosRankSumTest extends RankSumTest {
|
|||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
for (int i=unclippedReadBases.length-numEndClippedBases-1; i >= 0; i-- ){
|
||||
for (int i = unclippedReadBases.length - numEndClippedBases - 1; i >= 0; i--) {
|
||||
if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
|
||||
numEndClippedBases++;
|
||||
else
|
||||
|
|
@ -157,8 +157,6 @@ public class ReadPosRankSumTest extends RankSumTest {
|
|||
}
|
||||
|
||||
int getOffsetFromClippedReadStart(SAMRecord read, int offset) {
|
||||
|
||||
|
||||
return offset - getNumClippedBasesAtStart(read);
|
||||
return offset - getNumClippedBasesAtStart(read);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -7,8 +7,9 @@ import org.broadinstitute.sting.gatk.samples.Sample;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -18,16 +19,14 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* User: rpoplin, lfran, ebanks
|
||||
* Date: 11/14/11
|
||||
*/
|
||||
|
||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
|
||||
|
||||
private Set<Sample> trios = null;
|
||||
private final static int REF = 0;
|
||||
private final static int HET = 1;
|
||||
private final static int HOM = 2;
|
||||
private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( trios == null ) {
|
||||
|
|
@ -38,10 +37,10 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
|
|||
}
|
||||
}
|
||||
|
||||
final Map<String,Object> toRet = new HashMap<String,Object>(1);
|
||||
final Map<String, Object> toRet = new HashMap<String, Object>(1);
|
||||
final HashSet<Sample> triosToTest = new HashSet<Sample>();
|
||||
|
||||
for( final Sample child : trios) {
|
||||
for( final Sample child : trios ) {
|
||||
final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() &&
|
||||
vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() &&
|
||||
vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods();
|
||||
|
|
@ -50,7 +49,9 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
|
|||
}
|
||||
}
|
||||
|
||||
toRet.put("TDT", calculateTDT( vc, triosToTest ));
|
||||
if( triosToTest.size() >= MIN_NUM_VALID_TRIOS ) {
|
||||
toRet.put("TDT", calculateTDT( vc, triosToTest ));
|
||||
}
|
||||
|
||||
return toRet;
|
||||
}
|
||||
|
|
@ -58,33 +59,52 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
|
|||
// return the descriptions used for the VCF INFO meta field
|
||||
public List<String> getKeyNames() { return Arrays.asList("TDT"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", 1, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); }
|
||||
|
||||
// Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
|
||||
private double calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {
|
||||
private List<Double> calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {
|
||||
|
||||
final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET);
|
||||
final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET);
|
||||
final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET);
|
||||
final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET);
|
||||
final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF);
|
||||
final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF);
|
||||
List<Double> pairwiseTDTs = new ArrayList<Double>(10);
|
||||
final int HomRefIndex = 0;
|
||||
|
||||
final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB);
|
||||
final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB);
|
||||
return (numer * numer) / denom;
|
||||
// for each pair of alleles, add the likelihoods
|
||||
int numAltAlleles = vc.getAlternateAlleles().size();
|
||||
for ( int alt = 1; alt <= numAltAlleles; alt++ ) {
|
||||
final int HetIndex = alt;
|
||||
final int HomVarIndex = determineHomIndex(alt, numAltAlleles+1);
|
||||
|
||||
final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HetIndex, HetIndex, HomVarIndex) + calculateNChildren(vc, triosToTest, HetIndex, HomVarIndex, HetIndex);
|
||||
final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HomVarIndex, HetIndex, HomVarIndex) + calculateNChildren(vc, triosToTest, HomVarIndex, HomVarIndex, HetIndex);
|
||||
final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, HomRefIndex, HetIndex, HetIndex);
|
||||
final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HomVarIndex, HetIndex, HetIndex);
|
||||
final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, HomRefIndex, HomRefIndex, HetIndex) + calculateNChildren(vc, triosToTest, HomRefIndex, HetIndex, HomRefIndex);
|
||||
final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HetIndex, HomRefIndex, HetIndex) + calculateNChildren(vc, triosToTest, HetIndex, HetIndex, HomRefIndex);
|
||||
|
||||
final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB);
|
||||
final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB);
|
||||
pairwiseTDTs.add((numer * numer) / denom);
|
||||
}
|
||||
|
||||
return pairwiseTDTs;
|
||||
}
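Spelled out, the per-alternate-allele statistic appended to pairwiseTDTs above is TDT_alt = numer^2 / denom, with numer = (nABGivenABandBB - nBBGivenABandBB) + 2 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB) and denom = (nABGivenABandBB + nBBGivenABandBB) + 4 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB), where each n term is the expected number of children with the given genotype from the given parental mating as computed by calculateNChildren from the genotype likelihoods, following the modified TDT referenced above.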
|
||||
|
||||
private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) {
|
||||
private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int momIdx, final int dadIdx ) {
|
||||
final double likelihoodVector[] = new double[triosToTest.size()];
|
||||
int iii = 0;
|
||||
for( final Sample child : triosToTest ) {
|
||||
final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector();
|
||||
final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector();
|
||||
final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector();
|
||||
likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx];
|
||||
likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx];
|
||||
}
|
||||
|
||||
return MathUtils.sumLog10(likelihoodVector);
|
||||
}
|
||||
|
||||
private static int determineHomIndex(final int alleleIndex, int numAlleles) {
|
||||
int result = 0;
|
||||
for ( int i = 0; i < alleleIndex; i++ )
|
||||
result += numAlleles--;
|
||||
return result;
|
||||
}
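To see what determineHomIndex computes: with a single alternate allele (so numAlleles = 2 when called), determineHomIndex(1, 2) returns 2, the familiar hom-var slot of a three-entry AA/AB/BB likelihood vector; with two alternate alleles (numAlleles = 3), determineHomIndex(1, 3) returns 3 and determineHomIndex(2, 3) returns 5, i.e. the indices this code treats as the hom-var entries for the first and second alternate allele in the per-sample likelihood vector.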
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -84,7 +83,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
|
||||
|
||||
/**
|
||||
* The INFO field will be annotated with information on the most biologically-significant effect
|
||||
|
|
@ -163,6 +161,13 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
@Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
|
||||
protected Boolean LIST = false;
|
||||
|
||||
/**
|
||||
* By default, the dbSNP ID is added only when the ID field in the variant VCF is empty.
|
||||
*/
|
||||
@Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated")
|
||||
protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
|
||||
public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
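As a concrete illustration of the new flag's effect (based on the VariantAnnotatorEngine change later in this diff): if the incoming record's ID field is empty and dbSNP supplies rs12345, the ID is set to rs12345 as before; if the record already carries rs100 and --alwaysAppendDbsnpId is set, the ID becomes rs100 followed by the VCF ID-field separator and rs12345, skipped when rs12345 is already present; without the flag, an existing ID is left untouched.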
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
|
||||
protected boolean indelsOnly = false;
|
||||
|
|
|
|||
|
|
@@ -195,11 +195,20 @@ public class VariantAnnotatorEngine {
    private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
        for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
            if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
                String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
                final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());

                // put the DB key into the INFO field
                infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
                // annotate dbsnp id if available and not already there
                if ( rsID != null && vc.emptyID() )
                    vc = new VariantContextBuilder(vc).id(rsID).make();

                // add the ID if appropriate
                if ( rsID != null ) {
                    if ( vc.emptyID() ) {
                        vc = new VariantContextBuilder(vc).id(rsID).make();
                    } else if ( walker.alwaysAppendDbsnpId() && vc.getID().indexOf(rsID) == -1 ) {
                        final String newRsID = vc.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID;
                        vc = new VariantContextBuilder(vc).id(newRsID).make();
                    }
                }
            } else {
                boolean overlapsComp = false;
                for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {
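The new branch above only changes how the rsID is merged into the record's existing ID field. A minimal sketch of that merge decision, pulled out as a pure function; the helper name and the assumption that VCFConstants.ID_FIELD_SEPARATOR is the VCF-standard ";" are mine, not part of this diff.

// Hypothetical helper mirroring the ID-merge logic above; names and the separator value are illustrative only.
public final class DbsnpIdMergeExample {
    private static final String ID_FIELD_SEPARATOR = ";"; // assumed value of VCFConstants.ID_FIELD_SEPARATOR

    // Returns the ID the record should carry after considering the dbSNP rsID.
    static String mergeId(final String currentId, final boolean emptyId, final String rsID, final boolean alwaysAppend) {
        if (rsID == null)
            return currentId;                              // nothing to add
        if (emptyId)
            return rsID;                                   // empty ID field: just use the rsID
        if (alwaysAppend && !currentId.contains(rsID))
            return currentId + ID_FIELD_SEPARATOR + rsID;  // append, avoiding duplicates
        return currentId;                                  // leave a populated ID alone by default
    }

    public static void main(String[] args) {
        System.out.println(mergeId(".", true, "rs123", false));           // rs123
        System.out.println(mergeId("rs100", false, "rs123", true));       // rs100;rs123
        System.out.println(mergeId("rs100;rs123", false, "rs123", true)); // rs100;rs123 (already present)
    }
}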
@@ -8,9 +8,9 @@ import java.util.List;
public interface AnnotatorCompatibleWalker {

    // getter methods for the various bindings used by the annotator
    public abstract RodBinding<VariantContext> getVariantRodBinding();
    public abstract RodBinding<VariantContext> getSnpEffRodBinding();
    public abstract RodBinding<VariantContext> getDbsnpRodBinding();
    public abstract List<RodBinding<VariantContext>> getCompRodBindings();
    public abstract List<RodBinding<VariantContext>> getResourceRodBindings();
    public abstract boolean alwaysAppendDbsnpId();
}
@@ -241,6 +241,11 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
        String alleleA = beagleGenotypePairs.get(0);
        String alleleB = beagleGenotypePairs.get(1);

        if ( alleleA.equals("null") || alleleB.equals("null") ) {
            logger.warn("Beagle produced 'null' alleles at location "+ref.getLocus().toString()+". Ignoring.");
            return 0;
        }

        // Beagle always produces genotype strings based on the strings we input in the likelihood file.
        String refString = vc_input.getReference().getDisplayString();
        if (refString.length() == 0) // ref was null

@@ -315,8 +320,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
                og = a1+"/"+a2;

            // See if Beagle switched genotypes
            if (!((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) ||
                    (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))))){
            if (! originalAlleleA.equals(Allele.NO_CALL) && beagleSwitchedGenotypes(bglAlleleA,originalAlleleA,bglAlleleB,originalAlleleB)){
                originalAttributes.put("OG",og);
                numGenotypesChangedByBeagle++;
            }

@@ -359,6 +363,11 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
        return 1;
    }

    private boolean beagleSwitchedGenotypes(Allele bglAlleleA, Allele originalAlleleA, Allele bglAlleleB, Allele originalAlleleB) {
        return !((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) ||
                (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))));
    }

    public Integer reduceInit() {
        return 0; // Nothing to do here
    }
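The extracted helper treats a Beagle genotype as unchanged when it matches the original allele pair in either order. A small illustrative check of that symmetry; allele strings stand in for the real Allele objects, so this is a sketch rather than the walker's code.

// Illustrative only: strings stand in for org.broadinstitute.sting Allele objects.
public class BeagleSwitchExample {
    static boolean switched(String bglA, String origA, String bglB, String origB) {
        return !((bglA.equals(origA) && bglB.equals(origB)) ||
                 (bglA.equals(origB) && bglB.equals(origA)));
    }

    public static void main(String[] args) {
        System.out.println(switched("A", "A", "G", "G")); // false: identical pair
        System.out.println(switched("G", "A", "A", "G")); // false: same pair, opposite order
        System.out.println(switched("A", "A", "G", "A")); // true: Beagle changed a hom-ref call
    }
}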
@@ -0,0 +1,124 @@
/*
 * Copyright (c) 2011 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.walkers.bqsr;

import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

import java.util.Arrays;
import java.util.BitSet;

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: 9/26/11
 */

public class ContextCovariate implements StandardCovariate {

    private int mismatchesContextSize;
    private int insertionsContextSize;
    private int deletionsContextSize;

    // Initialize any member variables using the command-line arguments passed to the walkers
    @Override
    public void initialize(final RecalibrationArgumentCollection RAC) {
        mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE;
        insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE;
        deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE;

        if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
            throw new UserException(String.format("Context size must be positive. If you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions: %d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
    }

    @Override
    public CovariateValues getValues(final GATKSAMRecord read) {
        int l = read.getReadLength();
        BitSet[] mismatches = new BitSet[l];
        BitSet[] insertions = new BitSet[l];
        BitSet[] deletions = new BitSet[l];

        final boolean negativeStrand = read.getReadNegativeStrandFlag();
        byte[] bases = read.getReadBases();
        if (negativeStrand)
            bases = BaseUtils.simpleReverseComplement(bases);

        for (int i = 0; i < read.getReadLength(); i++) {
            mismatches[i] = contextWith(bases, i, mismatchesContextSize);
            insertions[i] = contextWith(bases, i, insertionsContextSize);
            deletions[i] = contextWith(bases, i, deletionsContextSize);
        }

        if (negativeStrand) {
            reverse(mismatches);
            reverse(insertions);
            reverse(deletions);
        }
        return new CovariateValues(mismatches, insertions, deletions);
    }

    // Used to get the covariate's value from input csv file during on-the-fly recalibration
    @Override
    public final Object getValue(final String str) {
        return str;
    }

    /**
     * Calculates the context of a base independent of the covariate mode.
     *
     * @param bases the bases in the read to build the context from
     * @param offset the position in the read to calculate the context for
     * @param contextSize context size to use building the context
     * @return the context encoded as a BitSet, or null when fewer than contextSize bases precede the offset or the context contains an N
     */
    private BitSet contextWith(byte[] bases, int offset, int contextSize) {
        if (offset < contextSize)
            return null;

        String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
        if (context.contains("N"))
            return null;

        return MathUtils.bitSetFrom(context);
    }

    /**
     * Reverses the given array in place.
     *
     * @param array any array
     */
    private static void reverse(final Object[] array) {
        final int arrayLength = array.length;
        for (int l = 0, r = arrayLength - 1; l < r; l++, r--) {
            final Object temp = array[l];
            array[l] = array[r];
            array[r] = temp;
        }
    }
}
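contextWith only ever looks at the contextSize bases immediately before the offset, returning null when the read has not yet accumulated enough context or when an N falls inside the window. A tiny sketch of just that windowing step; the BitSet packing done by MathUtils.bitSetFrom is deliberately left out, since its exact encoding is not shown in this diff.

import java.util.Arrays;

// Sketch of the context-window selection; the real covariate then encodes this string as a BitSet.
public class ContextWindowExample {
    static String contextWith(byte[] bases, int offset, int contextSize) {
        if (offset < contextSize)
            return null;                                   // not enough preceding bases yet
        String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
        return context.contains("N") ? null : context;     // refuse ambiguous context
    }

    public static void main(String[] args) {
        byte[] read = "ACGTACGT".getBytes();
        System.out.println(contextWith(read, 1, 2)); // null (only one base precedes position 1)
        System.out.println(contextWith(read, 4, 2)); // "GT" (the two bases at positions 2 and 3)
    }
}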
@@ -0,0 +1,63 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;

import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Oct 30, 2009
 *
 * The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read.
 * In general most error checking and adjustments to the data are done before the call to the covariates' getValue methods in order to speed up the code.
 * This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed-up.
 */

public interface Covariate {
    /**
     * Initialize any member variables using the command-line arguments passed to the walker
     *
     * @param RAC the recalibration argument collection
     */
    public void initialize(RecalibrationArgumentCollection RAC);

    /**
     * Calculates covariate values for all positions in the read.
     *
     * @param read the read to calculate the covariates on.
     * @return all the covariate values for every base in the read.
     */
    public CovariateValues getValues(GATKSAMRecord read);

    public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration
}

interface RequiredCovariate extends Covariate {}

interface StandardCovariate extends Covariate {}

interface ExperimentalCovariate extends Covariate {}
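The interface comment above describes the contract: the per-read work happens once in getValues, while per-base lookups stay cheap. A hypothetical, minimal implementation (not part of this commit) that tags every base with the read's length shows the shape of that contract; it mirrors the pattern of the real covariates, which also fill a per-position array from a single per-read value.

// Hypothetical example implementation of the Covariate contract; not part of the GATK codebase.
public class ReadLengthCovariate implements Covariate {

    @Override
    public void initialize(final RecalibrationArgumentCollection RAC) {
        // nothing to configure for this toy covariate
    }

    @Override
    public CovariateValues getValues(final GATKSAMRecord read) {
        // All per-read work happens here, once; each base then shares the same value.
        final Integer[] values = new Integer[read.getReadLength()];
        java.util.Arrays.fill(values, read.getReadLength());
        return new CovariateValues(values, values, values);
    }

    @Override
    public Object getValue(final String str) {
        return Integer.parseInt(str); // parse the value back from the recalibration table
    }
}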
@@ -0,0 +1,88 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;

import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

/**
 * The object temporarily held by a read that describes all of its covariates.
 *
 * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap.
 *
 * @author Mauricio Carneiro
 * @since 2/8/12
 */
public class CovariateKeySet {
    private Object[][] mismatchesKeySet;
    private Object[][] insertionsKeySet;
    private Object[][] deletionsKeySet;

    private int nextCovariateIndex;

    private static String mismatchesCovariateName = "M";
    private static String insertionsCovariateName = "I";
    private static String deletionsCovariateName = "D";

    public CovariateKeySet(int readLength, int numberOfCovariates) {
        numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format)
        this.mismatchesKeySet = new Object[readLength][numberOfCovariates];
        this.insertionsKeySet = new Object[readLength][numberOfCovariates];
        this.deletionsKeySet = new Object[readLength][numberOfCovariates];
        initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName);
        initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName);
        initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName);
        this.nextCovariateIndex = 0;
    }

    public void addCovariate(CovariateValues covariate) {
        transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
        transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
        transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
        nextCovariateIndex++;
    }

    public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) {
        if (modelString.equals(mismatchesCovariateName))
            return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION;
        else if (modelString.equals(insertionsCovariateName))
            return RecalDataManager.BaseRecalibrationType.BASE_INSERTION;
        else if (modelString.equals(deletionsCovariateName))
            return RecalDataManager.BaseRecalibrationType.BASE_DELETION;
        throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString);
    }

    public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) {
        switch (errorModel) {
            case BASE_SUBSTITUTION:
                return getMismatchesKeySet(readPosition);
            case BASE_INSERTION:
                return getInsertionsKeySet(readPosition);
            case BASE_DELETION:
                return getDeletionsKeySet(readPosition);
            default:
                throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel);
        }
    }

    public Object[] getMismatchesKeySet(int readPosition) {
        return mismatchesKeySet[readPosition];
    }

    public Object[] getInsertionsKeySet(int readPosition) {
        return insertionsKeySet[readPosition];
    }

    public Object[] getDeletionsKeySet(int readPosition) {
        return deletionsKeySet[readPosition];
    }

    private void transposeCovariateValues(Object[][] keySet, Object[] covariateValues) {
        for (int i = 0; i < covariateValues.length; i++)
            keySet[i][nextCovariateIndex] = covariateValues[i];
    }

    private void initializeCovariateKeySet(Object[][] keySet, String covariateName) {
        int readLength = keySet.length;
        int lastCovariateIndex = keySet[0].length - 1;
        for (int i = 0; i < readLength; i++)
            keySet[i][lastCovariateIndex] = covariateName;
    }
}
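Reading the constructor together with initializeCovariateKeySet, the layout appears to be one row per read position and one column per covariate, plus a final column tagging each row with its error model ("M", "I", or "D") so the three tables can be molten into one. A short usage sketch under that reading; the two covariates and their values are fabricated purely for illustration.

// Illustrative usage of CovariateKeySet; the covariate values here are made up.
public class CovariateKeySetExample {
    public static void main(String[] args) {
        final int readLength = 3;
        final int numberOfCovariates = 2;                       // e.g. read group + quality score
        CovariateKeySet keySet = new CovariateKeySet(readLength, numberOfCovariates);

        // One CovariateValues per covariate, each holding a value for every read position.
        Object[] readGroup = new Object[]{"rg1", "rg1", "rg1"};
        Object[] qual      = new Object[]{30, 30, 20};
        keySet.addCovariate(new CovariateValues(readGroup, readGroup, readGroup));
        keySet.addCovariate(new CovariateValues(qual, qual, qual));

        // Row for position 2 of the mismatches table: [rg1, 20, M]
        System.out.println(java.util.Arrays.toString(keySet.getMismatchesKeySet(2)));
    }
}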
@@ -0,0 +1,37 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;

/**
 * An object to hold the different covariate values for all bases in the read.
 *
 * Currently we have three different covariates for each read:
 *   - Mismatch
 *   - Insertion
 *   - Deletion
 *
 * @author Mauricio Carneiro
 * @since 2/8/12
 */
public class CovariateValues {
    private Object[] mismatches;
    private Object[] insertions;
    private Object[] deletions;

    public CovariateValues(Object[] mismatch, Object[] insertion, Object[] deletion) {
        this.mismatches = mismatch;
        this.insertions = insertion;
        this.deletions = deletion;
    }

    public Object[] getMismatches() {
        return mismatches;
    }

    public Object[] getInsertions() {
        return insertions;
    }

    public Object[] getDeletions() {
        return deletions;
    }

}
@ -0,0 +1,204 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.EnumSet;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Cycle covariate.
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
|
||||
* For example, for the read: AAACCCCGAAATTTTTACTG
|
||||
* the cycle would be 11111111222333333344
|
||||
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
*/
|
||||
|
||||
public class CycleCovariate implements StandardCovariate {
|
||||
private final static EnumSet<NGSPlatform> DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS);
|
||||
private final static EnumSet<NGSPlatform> FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT);
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM))
|
||||
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform.");
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
@Override
|
||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||
Integer [] cycles = new Integer[read.getReadLength()];
|
||||
final NGSPlatform ngsPlatform = read.getNGSPlatform();
|
||||
|
||||
// Discrete cycle platforms
|
||||
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
||||
final int init;
|
||||
final int increment;
|
||||
if (!read.getReadNegativeStrandFlag()) {
|
||||
// Differentiate between first and second of pair.
|
||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
||||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
||||
//second of pair, positive strand
|
||||
init = -1;
|
||||
increment = -1;
|
||||
}
|
||||
else {
|
||||
//first of pair, positive strand
|
||||
init = 1;
|
||||
increment = 1;
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
||||
//second of pair, negative strand
|
||||
init = -read.getReadLength();
|
||||
increment = 1;
|
||||
}
|
||||
else {
|
||||
//first of pair, negative strand
|
||||
init = read.getReadLength();
|
||||
increment = -1;
|
||||
}
|
||||
}
|
||||
|
||||
int cycle = init;
|
||||
for (int i = 0; i < read.getReadLength(); i++) {
|
||||
cycles[i] = cycle;
|
||||
cycle += increment;
|
||||
}
|
||||
}
|
||||
|
||||
// Flow cycle platforms
|
||||
else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
||||
|
||||
final int readLength = read.getReadLength();
|
||||
final byte[] bases = read.getReadBases();
|
||||
|
||||
// Differentiate between first and second of pair.
|
||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
||||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
|
||||
|
||||
int cycle = multiplyByNegative1 ? -1 : 1;
|
||||
|
||||
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
||||
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
||||
if (!read.getReadNegativeStrandFlag()) { // Forward direction
|
||||
int iii = 0;
|
||||
while (iii < readLength) {
|
||||
while (iii < readLength && bases[iii] == (byte) 'T') {
|
||||
cycles[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'A') {
|
||||
cycles[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'C') {
|
||||
cycles[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'G') {
|
||||
cycles[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
if (iii < readLength) {
|
||||
if (multiplyByNegative1)
|
||||
cycle--;
|
||||
else
|
||||
cycle++;
|
||||
}
|
||||
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
|
||||
cycles[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
else { // Negative direction
|
||||
int iii = readLength - 1;
|
||||
while (iii >= 0) {
|
||||
while (iii >= 0 && bases[iii] == (byte) 'T') {
|
||||
cycles[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'A') {
|
||||
cycles[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'C') {
|
||||
cycles[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'G') {
|
||||
cycles[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
if (iii >= 0) {
|
||||
if (multiplyByNegative1)
|
||||
cycle--;
|
||||
else
|
||||
cycle++;
|
||||
}
|
||||
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
|
||||
cycles[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Unknown platforms
|
||||
else {
|
||||
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
|
||||
}
|
||||
|
||||
return new CovariateValues(cycles, cycles, cycles);
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||
@Override
|
||||
public final Object getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
}
|
||||
|
|
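The flow-cycle computation in CycleCovariate above is easiest to see on the example given in its class comment. A stripped-down sketch of the forward-strand, first-of-pair branch (regular ACGT bases only, no SAM machinery, just the repeated TACG sweep) reproduces the documented cycle string; the real covariate additionally handles ambiguous bases, negative-strand reads, and second-of-pair sign flipping.

// Simplified sketch of the forward-strand flow-cycle assignment described in the class comment.
public class FlowCycleExample {
    static int[] flowCycles(byte[] bases) {
        final int[] cycles = new int[bases.length];
        final byte[] flowOrder = {'T', 'A', 'C', 'G'};
        int cycle = 1;
        int i = 0;
        while (i < bases.length) {
            for (byte flow : flowOrder)                      // one TACG sweep per cycle
                while (i < bases.length && bases[i] == flow)
                    cycles[i++] = cycle;
            if (i < bases.length)
                cycle++;                                     // next flow cycle
        }
        return cycles;
    }

    public static void main(String[] args) {
        StringBuilder sb = new StringBuilder();
        for (int c : flowCycles("AAACCCCGAAATTTTTACTG".getBytes()))
            sb.append(c);
        System.out.println(sb);                              // 11111111222333333344
    }
}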
@@ -0,0 +1,71 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;

import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Nov 3, 2009
 *
 * The Reported Quality Score covariate.
 */

public class QualityScoreCovariate implements RequiredCovariate {

    // Initialize any member variables using the command-line arguments passed to the walkers
    @Override
    public void initialize(final RecalibrationArgumentCollection RAC) {
    }

    @Override
    public CovariateValues getValues(final GATKSAMRecord read) {
        int readLength = read.getReadLength();

        Integer[] mismatches = new Integer[readLength];
        Integer[] insertions = new Integer[readLength];
        Integer[] deletions = new Integer[readLength];

        byte[] baseQualities = read.getBaseQualities();
        byte[] baseInsertionQualities = read.getBaseInsertionQualities();
        byte[] baseDeletionQualities = read.getBaseDeletionQualities();

        for (int i = 0; i < baseQualities.length; i++) {
            mismatches[i] = (int) baseQualities[i];
            insertions[i] = (int) baseInsertionQualities[i];
            deletions[i] = (int) baseDeletionQualities[i];
        }

        return new CovariateValues(mismatches, insertions, deletions);
    }

    // Used to get the covariate's value from input csv file during on-the-fly recalibration
    @Override
    public final Object getValue(final String str) {
        return Integer.parseInt(str);
    }
}
@@ -0,0 +1,81 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;

import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

import java.util.Arrays;
import java.util.HashMap;

/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Oct 30, 2009
 *
 * The Read Group covariate.
 */

public class ReadGroupCovariate implements RequiredCovariate {

    private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
    private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
    private short nextId = 0;

    // Initialize any member variables using the command-line arguments passed to the walkers
    @Override
    public void initialize(final RecalibrationArgumentCollection RAC) {
    }

    @Override
    public CovariateValues getValues(final GATKSAMRecord read) {
        final int l = read.getReadLength();
        final String readGroupId = read.getReadGroup().getReadGroupId();
        short shortId;
        if (readGroupLookupTable.containsKey(readGroupId))
            shortId = readGroupLookupTable.get(readGroupId);
        else {
            shortId = nextId;
            readGroupLookupTable.put(readGroupId, nextId);
            readGroupReverseLookupTable.put(nextId, readGroupId);
            nextId++;
        }
        Short[] readGroups = new Short[l];
        Arrays.fill(readGroups, shortId);
        return new CovariateValues(readGroups, readGroups, readGroups);
    }

    // Used to get the covariate's value from input csv file during on-the-fly recalibration
    @Override
    public final Object getValue(final String str) {
        return str;
    }

    public final String decodeReadGroup(final short id) {
        return readGroupReverseLookupTable.get(id);
    }
}
@ -0,0 +1,710 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 6, 2009
|
||||
*
|
||||
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
|
||||
* It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
|
||||
* This class holds the parsing methods that are shared between CountCovariates and TableRecalibration.
|
||||
*/
|
||||
|
||||
public class RecalDataManager {
|
||||
public final NestedHashMap nestedHashMap; // The full dataset
|
||||
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
private final HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||
private static boolean warnUserNullPlatform = false;
|
||||
|
||||
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.
|
||||
|
||||
public enum BaseRecalibrationType {
|
||||
BASE_SUBSTITUTION,
|
||||
BASE_INSERTION,
|
||||
BASE_DELETION
|
||||
}
|
||||
|
||||
public enum SOLID_RECAL_MODE {
|
||||
/**
|
||||
* Treat reference inserted bases as reference matching bases. Very unsafe!
|
||||
*/
|
||||
DO_NOTHING,
|
||||
/**
|
||||
* Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option.
|
||||
*/
|
||||
SET_Q_ZERO,
|
||||
/**
|
||||
* In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV.
|
||||
*/
|
||||
SET_Q_ZERO_BASE_N,
|
||||
/**
|
||||
* Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference.
|
||||
*/
|
||||
REMOVE_REF_BIAS
|
||||
}
|
||||
|
||||
public enum SOLID_NOCALL_STRATEGY {
|
||||
/**
|
||||
* When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option.
|
||||
*/
|
||||
THROW_EXCEPTION,
|
||||
/**
|
||||
* Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare.
|
||||
*/
|
||||
LEAVE_READ_UNRECALIBRATED,
|
||||
/**
|
||||
* Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses.
|
||||
*/
|
||||
PURGE_READ
|
||||
}
|
||||
|
||||
public RecalDataManager() {
|
||||
nestedHashMap = new NestedHashMap();
|
||||
dataCollapsedReadGroup = null;
|
||||
dataCollapsedQualityScore = null;
|
||||
dataCollapsedByCovariate = null;
|
||||
}
|
||||
|
||||
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
|
||||
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration
|
||||
nestedHashMap = null;
|
||||
dataCollapsedReadGroup = new HashMap<BaseRecalibrationType, NestedHashMap>();
|
||||
dataCollapsedQualityScore = new HashMap<BaseRecalibrationType, NestedHashMap>();
|
||||
dataCollapsedByCovariate = new HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>>();
|
||||
for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
|
||||
dataCollapsedReadGroup.put(errorModel, new NestedHashMap());
|
||||
dataCollapsedQualityScore.put(errorModel, new NestedHashMap());
|
||||
dataCollapsedByCovariate.put(errorModel, new ArrayList<NestedHashMap>());
|
||||
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.get(errorModel).add(new NestedHashMap());
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
nestedHashMap = new NestedHashMap();
|
||||
dataCollapsedReadGroup = null;
|
||||
dataCollapsedQualityScore = null;
|
||||
dataCollapsedByCovariate = null;
|
||||
}
|
||||
}
|
||||
|
||||
public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) {
|
||||
return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
*
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
||||
*/
|
||||
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) {
|
||||
|
||||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||
//data.put(key, thisDatum); // add the mapping to the main table
|
||||
|
||||
final int qualityScore = Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
RecalDatum collapsedDatum;
|
||||
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
|
||||
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
|
||||
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
qualityScoreCollapsedKey[1] = key[1]; // and quality score
|
||||
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for (int iii = 0; iii < dataCollapsedByCovariate.get(errorModel).size(); iii++) {
|
||||
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
covariateCollapsedKey[1] = key[1]; // and quality score ...
|
||||
final Object theCovariateElement = key[iii + 2]; // and the given covariate
|
||||
if (theCovariateElement != null) {
|
||||
covariateCollapsedKey[2] = theCovariateElement;
|
||||
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
*
|
||||
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
||||
* @param maxQual At which value to cap the quality scores
|
||||
*/
|
||||
public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
|
||||
|
||||
for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
|
||||
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual);
|
||||
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual);
|
||||
for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) {
|
||||
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
|
||||
checkForSingletons(map.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) {
|
||||
|
||||
for (Object comp : data.keySet()) {
|
||||
final Object val = data.get(comp);
|
||||
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
|
||||
((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual);
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkForSingletons(final Map data) {
|
||||
// todo -- this looks like it's better just as a data.valueSet() call?
|
||||
for (Object comp : data.keySet()) {
|
||||
final Object val = data.get(comp);
|
||||
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
|
||||
if (data.keySet().size() == 1) {
|
||||
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ...
|
||||
// in a previous step of the sequential calculation model
|
||||
}
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
checkForSingletons((Map) val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table out of the set of all the tables held by this Object
|
||||
*
|
||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) {
|
||||
if (covariate == 0) {
|
||||
return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed
|
||||
}
|
||||
else if (covariate == 1) {
|
||||
return dataCollapsedQualityScore.get(errorModel); // Table where everything except read group and quality score has been collapsed
|
||||
}
|
||||
else {
|
||||
return dataCollapsedByCovariate.get(errorModel).get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
|
||||
*
|
||||
* @param read The read to adjust
|
||||
* @param RAC The list of shared command line arguments
|
||||
*/
|
||||
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
|
||||
GATKSAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
|
||||
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
|
||||
readGroup.setPlatform(RAC.FORCE_PLATFORM);
|
||||
}
|
||||
|
||||
if (readGroup.getPlatform() == null) {
|
||||
if (RAC.DEFAULT_PLATFORM != null) {
|
||||
if (!warnUserNullPlatform) {
|
||||
Utils.warnUser("The input .bam file contains reads with no platform information. " +
|
||||
"Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName());
|
||||
warnUserNullPlatform = true;
|
||||
}
|
||||
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space
|
||||
*
|
||||
* @param read The SAMRecord to parse
|
||||
*/
|
||||
public static void parseColorSpace(final GATKSAMRecord read) {
|
||||
|
||||
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
|
||||
if (ReadUtils.isSOLiDRead(read)) {
|
||||
if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if (attr != null) {
|
||||
byte[] colorSpace;
|
||||
if (attr instanceof String) {
|
||||
colorSpace = ((String) attr).getBytes();
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
|
||||
}
|
||||
final byte[] inconsistency = new byte[readBases.length];
|
||||
int iii;
|
||||
byte prevBase = colorSpace[0]; // The sentinel
|
||||
for (iii = 0; iii < readBases.length; iii++) {
|
||||
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
|
||||
inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1);
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency);
|
||||
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases
|
||||
* This method doesn't add the inconsistent tag to the read like parseColorSpace does
|
||||
*
|
||||
* @param read The SAMRecord to parse
|
||||
* @param originalQualScores The array of original quality scores to modify during the correction
|
||||
* @param solidRecalMode Which mode of solid recalibration to apply
|
||||
* @param refBases The reference for this read
|
||||
* @return A new array of quality scores that have been ref bias corrected
|
||||
*/
|
||||
public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) {
|
||||
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if (attr != null) {
|
||||
byte[] colorSpace;
|
||||
if (attr instanceof String) {
|
||||
colorSpace = ((String) attr).getBytes();
|
||||
}
|
||||
else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
final byte[] colorImpliedBases = readBases.clone();
|
||||
byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
|
||||
refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone());
|
||||
}
|
||||
final int[] inconsistency = new int[readBases.length];
|
||||
byte prevBase = colorSpace[0]; // The sentinel
|
||||
for (int iii = 0; iii < readBases.length; iii++) {
|
||||
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
|
||||
colorImpliedBases[iii] = thisBase;
|
||||
inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1);
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
|
||||
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
|
||||
if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0
|
||||
final boolean setBaseN = false;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
|
||||
}
|
||||
else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) {
|
||||
final boolean setBaseN = true;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
|
||||
}
|
||||
else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
|
||||
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead);
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
|
||||
return originalQualScores;
|
||||
}
|
||||
|
||||
public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
|
||||
if (ReadUtils.isSOLiDRead(read)) {
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if (attr != null) {
|
||||
byte[] colorSpace;
|
||||
if (attr instanceof String) {
|
||||
colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel
|
||||
}
|
||||
else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for (byte color : colorSpace) {
|
||||
if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') {
|
||||
return true; // There is a bad color in this SOLiD read and the user wants to skip over it
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
|
||||
return false; // There aren't any color no calls in this SOLiD read
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
|
||||
*
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param originalQualScores The array of original quality scores to set to zero if needed
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
|
||||
* @return The byte array of original quality scores some of which might have been set to zero
|
||||
*/
|
||||
private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) {
|
||||
|
||||
final boolean negStrand = read.getReadNegativeStrandFlag();
|
||||
for (int iii = 1; iii < originalQualScores.length; iii++) {
|
||||
if (inconsistency[iii] == 1) {
|
||||
if (readBases[iii] == refBases[iii]) {
|
||||
if (negStrand) {
|
||||
originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0;
|
||||
}
|
||||
else {
|
||||
originalQualScores[iii] = (byte) 0;
|
||||
}
|
||||
if (setBaseN) {
|
||||
readBases[iii] = (byte) 'N';
|
||||
}
|
||||
}
|
||||
// Set the prev base to Q0 as well
|
||||
if (readBases[iii - 1] == refBases[iii - 1]) {
|
||||
if (negStrand) {
|
||||
originalQualScores[originalQualScores.length - iii] = (byte) 0;
|
||||
}
|
||||
else {
|
||||
originalQualScores[iii - 1] = (byte) 0;
|
||||
}
|
||||
if (setBaseN) {
|
||||
readBases[iii - 1] = (byte) 'N';
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (negStrand) {
|
||||
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases(readBases);
|
||||
|
||||
return originalQualScores;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be changed to match the color or left as reference
|
||||
*
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
*/
|
||||
private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) {
|
||||
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
|
||||
if (attr != null) {
|
||||
byte[] colorSpaceQuals;
|
||||
if (attr instanceof String) {
|
||||
String x = (String) attr;
|
||||
colorSpaceQuals = x.getBytes();
|
||||
SAMUtils.fastqToPhred(colorSpaceQuals);
|
||||
}
|
||||
else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for (int iii = 1; iii < inconsistency.length - 1; iii++) {
|
||||
if (inconsistency[iii] == 1) {
|
||||
for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read
|
||||
if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
|
||||
if (readBases[jjj] == refBases[jjj]) {
|
||||
if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2);
|
||||
if (rand == 0) { // The color implied base won the coin flip
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
}
|
||||
else {
|
||||
final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
|
||||
final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
|
||||
int diffInQuality = maxQuality - minQuality;
|
||||
int numLow = minQuality;
|
||||
if (numLow == 0) {
|
||||
numLow++;
|
||||
diffInQuality++;
|
||||
}
|
||||
final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh);
|
||||
if (rand >= numLow) { // higher q score won
|
||||
if (maxQuality == (int) colorSpaceQuals[jjj]) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had higher q score, and won out, so nothing to do here
|
||||
}
|
||||
else { // lower q score won
|
||||
if (minQuality == (int) colorSpaceQuals[jjj]) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had lower q score, and won out, so nothing to do here
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases(readBases);
|
||||
}
|
||||
else { // No color space quality tag in file
|
||||
throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
|
||||
}
|
||||
}
|
||||
|
||||
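The weighting above makes the higher-quality color exponentially more likely to win. A standalone sketch of the same arithmetic with hypothetical qualities (the walker draws from the engine-wide random generator; a local Random is used here only for the demo):

import java.util.Random;

public class RefBiasWeightingSketch {
    public static void main(String[] args) {
        // Hypothetical color-space qualities for the reference-supporting and read-supporting colors
        final int refColorQual = 10;
        final int altColorQual = 20;

        int diffInQuality = Math.abs(altColorQual - refColorQual);
        int numLow = Math.min(refColorQual, altColorQual);
        if (numLow == 0) { numLow++; diffInQuality++; }      // avoid a zero weight, as in the walker code
        final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f));

        // With Q10 vs Q20 the higher-quality color is 10x more likely to be chosen: weights 10 vs 100
        System.out.printf("numLow=%d numHigh=%d%n", numLow, numHigh);

        final Random rng = new Random(42);
        final int rand = rng.nextInt(numLow + numHigh);
        System.out.println(rand >= numLow ? "higher-quality color wins" : "lower-quality color wins");
    }
}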
    /**
     * Given the base and the color calculate the next base in the sequence
     *
     * @param read The read, used only to report a malformed BAM if the color is unrecognized
     * @param prevBase The base
     * @param color The color
     * @return The next base in the sequence
     */
    private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) {
        switch (color) {
            case '0':
                return prevBase;
            case '1':
                return performColorOne(prevBase);
            case '2':
                return performColorTwo(prevBase);
            case '3':
                return performColorThree(prevBase);
            default:
                throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color +
                        " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias.");
        }
    }

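In SOLiD color space, color 0 preserves the previous base, 1 swaps within the A/C and G/T pairs, 2 swaps within A/G and C/T, and 3 complements. A self-contained decoding sketch (the transform tables are duplicated locally, simplified to uppercase bases, because performColorOne/Two/Three further down are private; the input colors are hypothetical):

public class ColorSpaceDecodeSketch {
    // Same transforms as performColorOne/Two/Three, duplicated here for a standalone demo
    static byte color1(byte b) { switch (b) { case 'A': return 'C'; case 'C': return 'A'; case 'G': return 'T'; case 'T': return 'G'; default: return b; } }
    static byte color2(byte b) { switch (b) { case 'A': return 'G'; case 'C': return 'T'; case 'G': return 'A'; case 'T': return 'C'; default: return b; } }
    static byte color3(byte b) { switch (b) { case 'A': return 'T'; case 'C': return 'G'; case 'G': return 'C'; case 'T': return 'A'; default: return b; } }

    static byte next(byte prev, char color) {
        switch (color) {
            case '0': return prev;
            case '1': return color1(prev);
            case '2': return color2(prev);
            case '3': return color3(prev);
            default: throw new IllegalArgumentException("bad color " + color);
        }
    }

    public static void main(String[] args) {
        byte base = 'A';                    // SOLiD reads start from a known primer base
        final String colors = "0123";       // hypothetical color calls
        final StringBuilder decoded = new StringBuilder();
        for (char c : colors.toCharArray()) {
            base = next(base, c);
            decoded.append((char) base);
        }
        System.out.println(decoded);        // prints "ACTA": A->A(0), A->C(1), C->T(2), T->A(3)
    }
}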
    /**
     * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
     *
     * @param read The read which contains the color space to check against
     * @param offset The offset in the read at which to check
     * @return Returns true if the base was inconsistent with the color space
     */
    public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) {
        final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
        if (attr != null) {
            final byte[] inconsistency = (byte[]) attr;
            // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference!
            if (read.getReadNegativeStrandFlag()) { // Negative direction
                return inconsistency[inconsistency.length - offset - 1] != (byte) 0;
            }
            else { // Forward direction
                return inconsistency[offset] != (byte) 0;
            }

            // This block of code is for if you want to check both the offset and the next base for color space inconsistency
            //if( read.getReadNegativeStrandFlag() ) { // Negative direction
            //    if( offset == 0 ) {
            //        return inconsistency[0] != 0;
            //    } else {
            //        return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0);
            //    }
            //} else { // Forward direction
            //    if( offset == inconsistency.length - 1 ) {
            //        return inconsistency[inconsistency.length - 1] != 0;
            //    } else {
            //        return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0);
            //    }
            //}

        }
        else { // No inconsistency array, so nothing is inconsistent
            return false;
        }
    }

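Because the inconsistency array follows sequencing order while the offset follows the aligned read, the negative-strand branch has to mirror the index. A tiny illustration of that mapping with hypothetical values:

public class InconsistencyOffsetSketch {
    public static void main(String[] args) {
        final byte[] inconsistency = {0, 1, 0, 0, 0};  // read-direction flags, hypothetical
        final int offset = 3;                          // offset into the aligned read

        // Forward strand: use the offset directly
        System.out.println(inconsistency[offset] != 0);                                // false
        // Negative strand: flip the coordinate, as isInconsistentColorSpace() does
        System.out.println(inconsistency[inconsistency.length - offset - 1] != 0);     // true (index 1)
    }
}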
    /**
     * Computes all requested covariates for every offset in the given read
     * by calling covariate.getValues(..), and caches the resulting CovariateKeySet
     * on the read as a temporary attribute.
     *
     * @param read The read for which to compute covariate values.
     * @param requestedCovariates The list of requested covariates.
     */
    public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
        final int numRequestedCovariates = requestedCovariates.size();
        final int readLength = read.getReadLength();
        final CovariateKeySet covariateKeySet = new CovariateKeySet(readLength, numRequestedCovariates);

        // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
        for (Covariate covariate : requestedCovariates)
            covariateKeySet.addCovariate(covariate.getValues(read));

        read.setTemporaryAttribute(COVARS_ATTRIBUTE, covariateKeySet);
    }

    /**
     * Perform a certain transversion (A <-> C or G <-> T) on the base.
     *
     * @param base the base [AaCcGgTt]
     * @return the transversion of the base, or the input base if it's not one of the understood ones
     */
    private static byte performColorOne(byte base) {
        switch (base) {
            case 'A':
            case 'a':
                return 'C';
            case 'C':
            case 'c':
                return 'A';
            case 'G':
            case 'g':
                return 'T';
            case 'T':
            case 't':
                return 'G';
            default:
                return base;
        }
    }

    /**
     * Perform a transition (A <-> G or C <-> T) on the base.
     *
     * @param base the base [AaCcGgTt]
     * @return the transition of the base, or the input base if it's not one of the understood ones
     */
    private static byte performColorTwo(byte base) {
        switch (base) {
            case 'A':
            case 'a':
                return 'G';
            case 'C':
            case 'c':
                return 'T';
            case 'G':
            case 'g':
                return 'A';
            case 'T':
            case 't':
                return 'C';
            default:
                return base;
        }
    }

    /**
     * Return the complement (A <-> T or C <-> G) of a base.
     *
     * @param base the base [AaCcGgTt]
     * @return the complementary base, or the input base if it's not one of the understood ones
     */
    private static byte performColorThree(byte base) {
        switch (base) {
            case 'A':
            case 'a':
                return 'T';
            case 'C':
            case 'c':
                return 'G';
            case 'G':
            case 'g':
                return 'C';
            case 'T':
            case 't':
                return 'A';
            default:
                return base;
        }
    }
}

@@ -0,0 +1,112 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;

/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Nov 3, 2009
 *
 * An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
 */

public class RecalDatum extends RecalDatumOptimized {

    private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
    private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)

    //---------------------------------------------------------------------------------------------------------------
    //
    // constructors
    //
    //---------------------------------------------------------------------------------------------------------------

    public RecalDatum() {
        numObservations = 0L;
        numMismatches = 0L;
        estimatedQReported = 0.0;
        empiricalQuality = 0.0;
    }

    public RecalDatum(final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality) {
        numObservations = _numObservations;
        numMismatches = _numMismatches;
        estimatedQReported = _estimatedQReported;
        empiricalQuality = _empiricalQuality;
    }

    public RecalDatum(final RecalDatum copy) {
        this.numObservations = copy.numObservations;
        this.numMismatches = copy.numMismatches;
        this.estimatedQReported = copy.estimatedQReported;
        this.empiricalQuality = copy.empiricalQuality;
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // increment methods
    //
    //---------------------------------------------------------------------------------------------------------------

    public final void combine(final RecalDatum other) {
        final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
        this.increment(other.numObservations, other.numMismatches);
        this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations);
        //if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // methods to derive empirical quality score
    //
    //---------------------------------------------------------------------------------------------------------------

    public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) {
        this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // misc. methods
    //
    //---------------------------------------------------------------------------------------------------------------

    public final double getEstimatedQReported() {
        return estimatedQReported;
    }

    public final double getEmpiricalQuality() {
        return empiricalQuality;
    }

    private double calcExpectedErrors() {
        return (double) this.numObservations * qualToErrorProb(estimatedQReported);
    }

    private double qualToErrorProb(final double qual) {
        return Math.pow(10.0, qual / -10.0);
    }
}
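A hedged usage sketch for the class above, assuming it is on the classpath together with RecalDatumOptimized; the observation and mismatch counts are hypothetical:

package org.broadinstitute.sting.gatk.walkers.bqsr;

public class RecalDatumCombineSketch {
    public static void main(String[] args) {
        // Two collapsed bins: 1,000 observations reported at Q20 and 4,000 reported at Q30
        final RecalDatum a = new RecalDatum(1000L, 12L, 20.0, 0.0);
        final RecalDatum b = new RecalDatum(4000L, 8L, 30.0, 0.0);

        a.combine(b);
        // expected errors: 1000 * 10^(-20/10) + 4000 * 10^(-30/10) = 10 + 4 = 14
        // estimatedQReported = -10 * log10(14 / 5000) ~= 25.5
        System.out.printf("estimated Q reported: %.1f%n", a.getEstimatedQReported());

        a.calcCombinedEmpiricalQuality(1, 40); // smoothing of 1, capped at Q40
        // empirical quality = -10 * log10((12 + 8 + 1) / (5000 + 1)) ~= 23.8
        System.out.printf("empirical quality:    %.1f%n", a.getEmpiricalQuality());
    }
}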

@@ -0,0 +1,115 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;

import org.broadinstitute.sting.utils.QualityUtils;

import java.util.List;

/*
 * Copyright (c) 2010 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Jan 6, 2010
 *
 * An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed.
 * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
 */

public class RecalDatumOptimized {

    protected long numObservations; // number of bases seen in total
    protected long numMismatches; // number of bases seen that didn't match the reference

    //---------------------------------------------------------------------------------------------------------------
    //
    // constructors
    //
    //---------------------------------------------------------------------------------------------------------------

    public RecalDatumOptimized() {
        numObservations = 0L;
        numMismatches = 0L;
    }

    public RecalDatumOptimized(final long _numObservations, final long _numMismatches) {
        numObservations = _numObservations;
        numMismatches = _numMismatches;
    }

    public RecalDatumOptimized(final RecalDatumOptimized copy) {
        this.numObservations = copy.numObservations;
        this.numMismatches = copy.numMismatches;
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // increment methods
    //
    //---------------------------------------------------------------------------------------------------------------

    public synchronized final void increment(final long incObservations, final long incMismatches) {
        numObservations += incObservations;
        numMismatches += incMismatches;
    }

    public synchronized final void increment(final RecalDatumOptimized other) {
        increment(other.numObservations, other.numMismatches);
    }

    public synchronized final void increment(final List<RecalDatumOptimized> data) {
        for (RecalDatumOptimized other : data) {
            this.increment(other);
        }
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // methods to derive empirical quality score
    //
    //---------------------------------------------------------------------------------------------------------------

    public final double empiricalQualDouble(final int smoothing, final double maxQual) {
        final double doubleMismatches = (double) (numMismatches + smoothing);
        final double doubleObservations = (double) (numObservations + smoothing);
        double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
        return Math.min(empiricalQual, maxQual);
    }

    public final byte empiricalQualByte(final int smoothing) {
        final double doubleMismatches = (double) (numMismatches + smoothing);
        final double doubleObservations = (double) (numObservations + smoothing);
        return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40
    }

    public final byte empiricalQualByte() {
        return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero
    }

    public final String outputToCSV() {
        return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
    }

}
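A small worked example of the empirical-quality arithmetic above (hypothetical counts; assumes the class is on the classpath):

package org.broadinstitute.sting.gatk.walkers.bqsr;

public class RecalDatumOptimizedSketch {
    public static void main(String[] args) {
        final RecalDatumOptimized datum = new RecalDatumOptimized();
        datum.increment(10000L, 10L);                            // 10 mismatches in 10,000 observed bases

        // -10 * log10(10 / 10000) = 30; a smoothing of 1 nudges this to -10 * log10(11 / 10001) ~= 29.6
        System.out.println(datum.empiricalQualDouble(0, 40.0));  // 30.0
        System.out.println(datum.empiricalQualDouble(1, 40.0));  // ~29.59
        System.out.println(datum.outputToCSV());                 // 10000,10,30
    }
}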

@@ -0,0 +1,165 @@
/*
 * Copyright (c) 2010 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.walkers.bqsr;

import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Nov 27, 2009
 *
 * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
 * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
 */

public class RecalibrationArgumentCollection {

    /**
     * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
     * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
     * for use as this database. For users wishing to exclude an interval list of known variation, simply use -XL my.interval.list to skip over processing those sites.
     * Please note, however, that the statistics reported by the tool will not accurately reflect the sites skipped by the -XL argument.
     */
    @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
    protected List<RodBinding<Feature>> knownSites = Collections.emptyList();

    /**
     * After the header, data records occur one per line until the end of the file. The first several items on a line are the
     * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
     * three items are the data: the number of observations for this combination of covariates, the number of reference mismatches,
     * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
     */
    @Gather(CountCovariatesGatherer.class)
    @Output
    protected PrintStream RECAL_FILE;

    /**
     * List all implemented covariates.
     */
    @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
    protected boolean LIST_ONLY = false;

    /**
     * Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list.
     */
    @Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
    protected String[] COVARIATES = null;

    /**
     * Use the standard set of covariates in addition to the ones listed using the -cov argument
     */
    @Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
    protected boolean USE_STANDARD_COVARIATES = true;

    /////////////////////////////
    // Debugging-only Arguments
    /////////////////////////////
    /**
     * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
     */
    @Hidden
    @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
    protected boolean RUN_WITHOUT_DBSNP = false;

    /////////////////////////////
    // protected Member Variables
    /////////////////////////////
    protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables)
    protected final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // A list to hold the covariate objects that were requested

    protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
    protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.


    /**
     * CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
     * reads which have had the reference inserted because of color space inconsistencies.
     */
    @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
    public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;

    /**
     * CountCovariates and TableRecalibration accept a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
     * no calls in the color space tag. Unfortunately, because of the reference-inserted bases mentioned above, reads with no calls in
     * their color space tag cannot be recalibrated.
     */
    @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
    public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;

    /**
     * The context covariate will use a context of this size to calculate its covariate value for base mismatches
     */
    @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
    public int MISMATCHES_CONTEXT_SIZE = 2;

    /**
     * The context covariate will use a context of this size to calculate its covariate value for base insertions
     */
    @Argument(fullName = "insertions_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions", required = false)
    public int INSERTIONS_CONTEXT_SIZE = 8;

    /**
     * The context covariate will use a context of this size to calculate its covariate value for base deletions
     */
    @Argument(fullName = "deletions_context_size", shortName = "dcs", doc = "size of the k-mer context to be used for base deletions", required = false)
    public int DELETIONS_CONTEXT_SIZE = 8;

    /**
     * A default base quality to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read with this default value. A negative value turns it off (the default is off).
     */
    @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
    public byte MISMATCHES_DEFAULT_QUALITY = -1;

    /**
     * A default base quality to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base (the default is on).
     */
    @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
    public byte INSERTIONS_DEFAULT_QUALITY = 45;

    /**
     * A default base quality to use as a prior (reported quality) in the deletion covariate model. This parameter is used for all reads without deletion quality scores for each base (the default is on).
     */
    @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
    public byte DELETIONS_DEFAULT_QUALITY = 45;


    @Hidden
    @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
    public String DEFAULT_PLATFORM = null;
    @Hidden
    @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
    public String FORCE_PLATFORM = null;

}
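The RECAL_FILE javadoc above describes each data record as the covariate values followed by the observations, mismatches, and the phred-scaled mismatch rate. A hedged, illustrative sketch of formatting one such line (all values hypothetical; it does not use the real Covariate classes):

public class RecalCsvRowSketch {
    public static void main(String[] args) {
        final String readGroup = "SRR000001";   // ReadGroup covariate (hypothetical)
        final int reportedQual = 25;            // ReportedQuality covariate
        final int cycle = 3;                    // Cycle covariate
        final long observations = 10000L;
        final long mismatches = 25L;
        // raw empirical quality = phred-scaled mismatch rate
        final long empiricalQual = Math.round(-10.0 * Math.log10((double) mismatches / observations));
        System.out.printf("%s,%d,%d,%d,%d,%d%n", readGroup, reportedQual, cycle, observations, mismatches, empiricalQual);
        // -> SRR000001,25,3,10000,25,26
    }
}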
|
|
@ -26,8 +26,10 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.coverage;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -113,27 +115,13 @@ import java.util.*;
|
|||
// todo -- allow for user to set linear binning (default is logarithmic)
|
||||
// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
|
||||
@By(DataSource.REFERENCE)
|
||||
@PartitionBy(PartitionType.INTERVAL)
|
||||
@PartitionBy(PartitionType.NONE)
|
||||
@Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE)
|
||||
public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partition,Map<String,int[]>>, CoveragePartitioner> implements TreeReducible<CoveragePartitioner> {
|
||||
@Output
|
||||
@Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"})
|
||||
Map<DoCOutputType,PrintStream> out;
|
||||
|
||||
/**
|
||||
* Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin.
|
||||
*/
|
||||
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
|
||||
int start = 1;
|
||||
/**
|
||||
* Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin.
|
||||
*/
|
||||
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
|
||||
int stop = 500;
|
||||
/**
|
||||
* Sets the number of bins for granular binning
|
||||
*/
|
||||
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
|
||||
int nBins = 499;
|
||||
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to -1.", required = false)
|
||||
int minMappingQuality = -1;
|
||||
@Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false)
|
||||
|
|
@ -142,16 +130,19 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
|||
byte minBaseQuality = -1;
|
||||
@Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth. Defaults to 127 (Byte.MAX_VALUE).", required = false)
|
||||
byte maxBaseQuality = Byte.MAX_VALUE;
|
||||
|
||||
/**
|
||||
* Instead of reporting depth, report the base pileup at each locus
|
||||
*/
|
||||
@Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false)
|
||||
boolean printBaseCounts = false;
|
||||
|
||||
/**
|
||||
* Do not tabulate locus statistics (# loci covered by sample by coverage)
|
||||
*/
|
||||
@Argument(fullName = "omitLocusTable", shortName = "omitLocusTable", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false)
|
||||
boolean omitLocusTable = false;
|
||||
|
||||
/**
|
||||
* Do not tabulate interval statistics (mean, median, quartiles AND # intervals by sample by coverage)
|
||||
*/
|
||||
|
|
@ -162,8 +153,52 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
|||
*/
|
||||
@Argument(fullName = "omitDepthOutputAtEachBase", shortName = "omitBaseOutput", doc = "Will omit the output of the depth of coverage at each base, which should result in speedup", required = false)
|
||||
boolean omitDepthOutput = false;
|
||||
|
||||
/**
|
||||
* Path to the RefSeq file for use in aggregating coverage statistics over genes
|
||||
*/
|
||||
@Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false)
|
||||
File refSeqGeneList = null;
|
||||
|
||||
/**
|
||||
* The format of the output file
|
||||
*/
|
||||
@Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. csv, table, rtable); defaults to r-readable table", required = false)
|
||||
String outputFormat = "rtable";
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Advanced arguments
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
@Advanced
|
||||
@Argument(fullName = "includeRefNSites", doc = "If provided, sites with reference N bases but with coverage from neighboring reads will be included in DoC calculations.", required = false)
|
||||
boolean includeRefNBases = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName = "printBinEndpointsAndExit", doc = "Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data.", required = false)
|
||||
boolean printBinEndpointsAndExit = false;
|
||||
|
||||
/**
|
||||
* Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
|
||||
int start = 1;
|
||||
/**
|
||||
* Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
|
||||
int stop = 500;
|
||||
/**
|
||||
* Sets the number of bins for granular binning
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
|
||||
int nBins = 499;
|
||||
|
||||
/**
|
||||
* Do not tabulate the sample summary statistics (total, mean, median, quartile coverage per sample)
|
||||
*/
|
||||
|
|
@ -174,27 +209,22 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
|||
*/
|
||||
@Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library.", required = false)
|
||||
Set<DoCOutputType.Partition> partitionTypes = EnumSet.of(DoCOutputType.Partition.sample);
|
||||
|
||||
/**
|
||||
* Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false)
|
||||
boolean includeDeletions = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
|
||||
boolean ignoreDeletionSites = false;
|
||||
|
||||
/**
|
||||
* Path to the RefSeq file for use in aggregating coverage statistics over genes
|
||||
*/
|
||||
@Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false)
|
||||
File refSeqGeneList = null;
|
||||
/**
|
||||
* The format of the output file
|
||||
*/
|
||||
@Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. csv, table, rtable); defaults to r-readable table", required = false)
|
||||
String outputFormat = "rtable";
|
||||
/**
|
||||
* A coverage threshold for summarizing (e.g. % bases >= CT for each sample)
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "for summary file outputs, report the % of bases coverd to >= this number. Defaults to 15; can take multiple arguments.", required = false)
|
||||
int[] coverageThresholds = {15};
|
||||
|
||||
|
|
@ -334,24 +364,29 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
|||
}
|
||||
|
||||
public Map<DoCOutputType.Partition,Map<String,int[]>> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (includeRefNBases || BaseUtils.isRegularBase(ref.getBase())) {
|
||||
if ( ! omitDepthOutput ) {
|
||||
getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
|
||||
//System.out.printf("\t[log]\t%s",ref.getLocus());
|
||||
}
|
||||
|
||||
if ( ! omitDepthOutput ) {
|
||||
getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
|
||||
//System.out.printf("\t[log]\t%s",ref.getLocus());
|
||||
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
||||
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
|
||||
}
|
||||
|
||||
public CoveragePartitioner reduce(Map<DoCOutputType.Partition,Map<String,int[]>> thisMap, CoveragePartitioner prevReduce) {
|
||||
if ( ! omitDepthOutput ) {
|
||||
//checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order
|
||||
printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType());
|
||||
// this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without
|
||||
// turning on omit
|
||||
}
|
||||
if ( thisMap != null ) { // skip sites we didn't want to include in the calculation (ref Ns)
|
||||
if ( ! omitDepthOutput ) {
|
||||
//checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order
|
||||
printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType());
|
||||
// this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without
|
||||
// turning on omit
|
||||
}
|
||||
|
||||
prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object
|
||||
prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object
|
||||
}
|
||||
|
||||
return prevReduce;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,162 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Computes the read error rate per position in the read (in the original 5'->3' orientation that the read had coming off the machine)
|
||||
*
|
||||
* Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read
|
||||
* group in the input BAMs FOR ONLY THE FIRST OF PAIR READS.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Any number of BAM files
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate.
|
||||
*
|
||||
* For example, running this tool on the NA12878 data sets:
|
||||
*
|
||||
* <pre>
|
||||
* ##:GATKReport.v0.2 ErrorRatePerCycle : The error rate per sequenced position in the reads
|
||||
* readgroup cycle mismatches counts qual errorrate
|
||||
* 20FUK.1 0 80 23368 25 3.47e-03
|
||||
* 20FUK.1 1 40 23433 28 1.75e-03
|
||||
* 20FUK.1 2 36 23453 28 1.58e-03
|
||||
* 20FUK.1 3 26 23476 29 1.15e-03
|
||||
* 20FUK.1 4 32 23495 29 1.40e-03
|
||||
* up to 101 cycles
|
||||
* 20FUK.2 0 77 20886 24 3.73e-03
|
||||
* 20FUK.2 1 28 20920 29 1.39e-03
|
||||
* 20FUK.2 2 24 20931 29 1.19e-03
|
||||
* 20FUK.2 3 30 20940 28 1.48e-03
|
||||
* 20FUK.2 4 25 20948 29 1.24e-03
|
||||
* up to 101 cycles
|
||||
* 20FUK.3 0 78 22038 24 3.58e-03
|
||||
* 20FUK.3 1 40 22091 27 1.86e-03
|
||||
* 20FUK.3 2 23 22108 30 1.09e-03
|
||||
* 20FUK.3 3 36 22126 28 1.67e-03
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ErrorRatePerCycle
|
||||
* -I bundle/current/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam
|
||||
* -R bundle/current/b37/human_g1k_v37.fasta
|
||||
* -o example.gatkreport.txt
|
||||
* </pre>
|
||||
*
|
||||
* @author Kiran Garimella, Mark DePristo
|
||||
*/
|
||||
public class ErrorRatePerCycle extends LocusWalker<Integer, Integer> {
|
||||
@Output PrintStream out;
|
||||
@Argument(fullName="min_base_quality_score", shortName="mbq", doc="Minimum base quality required to consider a base for calling", required=false)
|
||||
public Integer MIN_BASE_QUAL = 0;
|
||||
@Argument(fullName="min_mapping_quality_score", shortName="mmq", doc="Minimum read mapping quality required to consider a read for calling", required=false)
|
||||
public Integer MIN_MAPPING_QUAL = 20;
|
||||
|
||||
private GATKReport report;
|
||||
private GATKReportTable table;
|
||||
private final static String reportName = "ErrorRatePerCycle";
|
||||
private final static String reportDescription = "The error rate per sequenced position in the reads";
|
||||
|
||||
/**
|
||||
* Allows us to use multiple records for the key (read group x cycle)
|
||||
*/
|
||||
private static class TableKey implements Comparable<TableKey> {
|
||||
final String readGroup;
|
||||
final int cycle;
|
||||
|
||||
private TableKey(final String readGroup, final int cycle) {
|
||||
this.readGroup = readGroup;
|
||||
this.cycle = cycle;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final TableKey tableKey) {
|
||||
final int scmp = readGroup.compareTo(tableKey.readGroup);
|
||||
if ( scmp == 0 )
|
||||
return Integer.valueOf(cycle).compareTo(tableKey.cycle);
|
||||
else
|
||||
return scmp;
|
||||
}
|
||||
}
|
||||
|
||||
public void initialize() {
|
||||
report = new GATKReport();
|
||||
report.addTable(reportName, reportDescription);
|
||||
table = report.getTable(reportName);
|
||||
table.addPrimaryKey("key", false);
|
||||
table.addColumn("readgroup", 0);
|
||||
table.addColumn("cycle", 0);
|
||||
table.addColumn("mismatches", 0);
|
||||
table.addColumn("counts", 0);
|
||||
table.addColumn("qual", 0);
|
||||
table.addColumn("errorrate", 0.0f, "%.2e");
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
for ( final PileupElement p : context.getBasePileup() ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
final int offset = p.getOffset();
|
||||
final boolean firstOfPair = ! read.getReadPairedFlag() || read.getFirstOfPairFlag();
|
||||
|
||||
if ( firstOfPair && read.getMappingQuality() >= MIN_MAPPING_QUAL && p.getQual() >= MIN_BASE_QUAL ) {
|
||||
final byte readBase = p.getBase();
|
||||
final byte refBase = ref.getBase();
|
||||
final int cycle = offset;
|
||||
|
||||
if ( BaseUtils.isRegularBase(readBase) && BaseUtils.isRegularBase(refBase) ) {
|
||||
final TableKey key = new TableKey(read.getReadGroup().getReadGroupId(), cycle);
|
||||
|
||||
if ( ! table.containsKey(key) ) {
|
||||
table.set(key, "cycle", cycle);
|
||||
table.set(key, "readgroup", read.getReadGroup().getReadGroupId());
|
||||
}
|
||||
|
||||
table.increment(key, "counts");
|
||||
if (readBase != refBase)
|
||||
table.increment(key, "mismatches");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return null; }
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) { return null; }
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
for ( final Object key : table.getPrimaryKeys() ) {
|
||||
final int mismatches = (Integer)table.get(key, "mismatches");
|
||||
final int count = (Integer)table.get(key, "counts");
|
||||
final double errorRate = (mismatches + 1) / (1.0*(count + 1));
|
||||
final int qual = QualityUtils.probToQual(1-errorRate, 0.0);
|
||||
table.set(key, "qual", qual);
|
||||
table.set(key, "errorrate", errorRate);
|
||||
}
|
||||
|
||||
report.print(out);
|
||||
}
|
||||
}
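
A quick check of the error-rate arithmetic used in onTraversalDone() above, reproducing the first row of the example table with plain Java (the +1 pseudocounts match the walker's; the rounding mirrors what QualityUtils.probToQual(1 - errorRate, 0.0) computes here):

public class ErrorRateMathSketch {
    public static void main(String[] args) {
        final int mismatches = 80;
        final int counts = 23368;
        final double errorRate = (mismatches + 1) / (1.0 * (counts + 1));
        final long qual = Math.round(-10.0 * Math.log10(errorRate));
        System.out.printf("%d %.2e%n", qual, errorRate);   // 25 3.47e-03, matching the first table row
    }
}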
|
||||
|
|
@ -0,0 +1,223 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.Median;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.text.DateFormat;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Emits a GATKReport containing read group, sample, library, platform, center, sequencing date,
* paired end status, simple read type name (e.g. 2x76), median insert size and median read length
* for each read group in every provided BAM file
|
||||
*
|
||||
* Note that this walker stops when all read groups have been observed at least a few thousand times, so that
* the median statistics are well determined. It is safe to run it whole-genome and it will finish in an appropriate
* timeframe.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Any number of BAM files
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* GATKReport containing read group, sample, library, platform, center, median insert size and median read length.
|
||||
*
|
||||
* For example, running this tool on the NA12878 data sets:
|
||||
*
|
||||
* <pre>
|
||||
* ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
|
||||
* readgroup sample library platform center date has.any.reads is.paired.end n.reads.analyzed simple.read.type median.read.length median.insert.size
|
||||
* 20FUK.1 NA12878 Solexa-18483 illumina BI 2/2/10 true true 498 2x101 101 386
|
||||
* 20FUK.2 NA12878 Solexa-18484 illumina BI 2/2/10 true true 476 2x101 101 417
|
||||
* 20FUK.3 NA12878 Solexa-18483 illumina BI 2/2/10 true true 407 2x101 101 387
|
||||
* 20FUK.4 NA12878 Solexa-18484 illumina BI 2/2/10 true true 389 2x101 101 415
|
||||
* 20FUK.5 NA12878 Solexa-18483 illumina BI 2/2/10 true true 433 2x101 101 386
|
||||
* 20FUK.6 NA12878 Solexa-18484 illumina BI 2/2/10 true true 480 2x101 101 418
|
||||
* 20FUK.7 NA12878 Solexa-18483 illumina BI 2/2/10 true true 450 2x101 101 386
|
||||
* 20FUK.8 NA12878 Solexa-18484 illumina BI 2/2/10 true true 438 2x101 101 418
|
||||
* 20GAV.1 NA12878 Solexa-18483 illumina BI 1/26/10 true true 490 2x101 101 391
|
||||
* 20GAV.2 NA12878 Solexa-18484 illumina BI 1/26/10 true true 485 2x101 101 417
|
||||
* 20GAV.3 NA12878 Solexa-18483 illumina BI 1/26/10 true true 460 2x101 101 392
|
||||
* 20GAV.4 NA12878 Solexa-18484 illumina BI 1/26/10 true true 434 2x101 101 415
|
||||
* 20GAV.5 NA12878 Solexa-18483 illumina BI 1/26/10 true true 479 2x101 101 389
|
||||
* 20GAV.6 NA12878 Solexa-18484 illumina BI 1/26/10 true true 461 2x101 101 416
|
||||
* 20GAV.7 NA12878 Solexa-18483 illumina BI 1/26/10 true true 509 2x101 101 386
|
||||
* 20GAV.8   NA12878  Solexa-18484  illumina  BI      1/26/10  true           true           476               2x101             101                 410
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ReadGroupProperties
|
||||
* -I example1.bam -I example2.bam etc
|
||||
* -R reference.fasta
|
||||
* -o example.gatkreport.txt
|
||||
* </pre>
|
||||
*
|
||||
* @author Mark DePristo
|
||||
*/
|
||||
public class ReadGroupProperties extends ReadWalker<Integer, Integer> {
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
@Argument(shortName="maxElementsForMedian", doc="Calculate median from the first maxElementsForMedian values observed", required=false)
|
||||
public int MAX_VALUES_FOR_MEDIAN = 10000;
|
||||
|
||||
private final static String TABLE_NAME = "ReadGroupProperties";
|
||||
private final Map<String, PerReadGroupInfo> readGroupInfo = new HashMap<String, PerReadGroupInfo>();
|
||||
|
||||
private class PerReadGroupInfo {
|
||||
public final Median<Integer> readLength = new Median<Integer>(MAX_VALUES_FOR_MEDIAN);
|
||||
public final Median<Integer> insertSize = new Median<Integer>(MAX_VALUES_FOR_MEDIAN);
|
||||
public int nReadsSeen = 0, nReadsPaired = 0;
|
||||
|
||||
public boolean needsMoreData() {
|
||||
return ! readLength.isFull() || ! insertSize.isFull();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
|
||||
readGroupInfo.put(rg.getId(), new PerReadGroupInfo());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
|
||||
return ! (read.getReadFailsVendorQualityCheckFlag() || read.getReadUnmappedFlag());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isDone() {
|
||||
for ( PerReadGroupInfo info : readGroupInfo.values() ) {
|
||||
if ( info.needsMoreData() )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) {
|
||||
final String rgID = read.getReadGroup().getId();
|
||||
final PerReadGroupInfo info = readGroupInfo.get(rgID);
|
||||
|
||||
if ( info.needsMoreData() ) {
|
||||
info.readLength.add(read.getReadLength());
|
||||
info.nReadsSeen++;
|
||||
if ( read.getReadPairedFlag() ) {
|
||||
info.nReadsPaired++;
|
||||
if ( read.getInferredInsertSize() != 0) {
|
||||
info.insertSize.add(Math.abs(read.getInferredInsertSize()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer reduce(Integer integer, Integer integer1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer sum) {
|
||||
final GATKReport report = new GATKReport();
|
||||
report.addTable(TABLE_NAME, "Table of read group properties");
|
||||
GATKReportTable table = report.getTable(TABLE_NAME);
|
||||
DateFormat dateFormatter = DateFormat.getDateInstance(DateFormat.SHORT);
|
||||
|
||||
table.addPrimaryKey("readgroup");
|
||||
//* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
|
||||
//* median read length for each read group in every BAM file.
|
||||
table.addColumn("sample", "NA");
|
||||
table.addColumn("library", "NA");
|
||||
table.addColumn("platform", "NA");
|
||||
table.addColumn("center", "NA");
|
||||
table.addColumn("date", "NA");
|
||||
table.addColumn("has.any.reads", "false");
|
||||
table.addColumn("is.paired.end", "false");
|
||||
table.addColumn("n.reads.analyzed", "NA");
|
||||
table.addColumn("simple.read.type", "NA");
|
||||
table.addColumn("median.read.length", Integer.valueOf(0));
|
||||
table.addColumn("median.insert.size", Integer.valueOf(0));
|
||||
|
||||
for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
|
||||
final String rgID = rg.getId();
|
||||
PerReadGroupInfo info = readGroupInfo.get(rgID);
|
||||
|
||||
// we are paired if > 25% of reads are paired
|
||||
final boolean isPaired = info.nReadsPaired / (1.0 * (info.nReadsSeen+1)) > 0.25;
|
||||
final boolean hasAnyReads = info.nReadsSeen > 0;
|
||||
final int readLength = info.readLength.getMedian(0);
|
||||
|
||||
setTableValue(table, rgID, "sample", rg.getSample());
|
||||
setTableValue(table, rgID, "library", rg.getLibrary());
|
||||
setTableValue(table, rgID, "platform", rg.getPlatform());
|
||||
setTableValue(table, rgID, "center", rg.getSequencingCenter());
|
||||
try {
|
||||
setTableValue(table, rgID, "date", rg.getRunDate() != null ? dateFormatter.format(rg.getRunDate()) : "NA");
|
||||
} catch ( NullPointerException e ) {
|
||||
// TODO: remove me when bug in Picard is fixed that causes NPE when date isn't present
|
||||
setTableValue(table, rgID, "date", "NA");
|
||||
}
|
||||
setTableValue(table, rgID, "has.any.reads", hasAnyReads);
|
||||
setTableValue(table, rgID, "is.paired.end", isPaired);
|
||||
setTableValue(table, rgID, "n.reads.analyzed", info.nReadsSeen);
|
||||
setTableValue(table, rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA");
|
||||
setTableValue(table, rgID, "median.read.length", hasAnyReads ? readLength : "NA" );
|
||||
setTableValue(table, rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" );
|
||||
}
|
||||
|
||||
report.print(out);
|
||||
}
|
||||
|
||||
private final void setTableValue(GATKReportTable table, final String rgID, final String key, final Object value) {
|
||||
table.set(rgID, key, value == null ? "NA" : value);
|
||||
}
|
||||
}
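
A minimal sketch of the paired-end heuristic and the simple.read.type label computed in onTraversalDone() above, with hypothetical counts:

public class ReadTypeLabelSketch {
    public static void main(String[] args) {
        final int nReadsSeen = 498, nReadsPaired = 498, medianReadLength = 101;
        // A read group is called paired-end when more than 25% of its sampled reads are paired
        final boolean isPaired = nReadsPaired / (1.0 * (nReadsSeen + 1)) > 0.25;
        System.out.printf("%dx%d%n", isPaired ? 2 : 1, medianReadLength);   // 2x101
    }
}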
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;

/**
 * The callable status of a target locus.
 *
 * @author Mauricio Carneiro
 * @since 2/1/12
 */
public enum CallableStatus {
    /** the reference base was an N, which is not considered callable by the GATK */
    REF_N,
    /** the base satisfied the min. depth for calling but had less than maxDepth, so it does not trigger EXCESSIVE_COVERAGE */
    CALLABLE,
    /** absolutely no reads were seen at this locus, regardless of the filtering parameters */
    NO_COVERAGE,
    /** there were fewer than min. depth bases at the locus, after applying filters */
    LOW_COVERAGE,
    /** more than -maxDepth reads at the locus, indicating some sort of mapping problem */
    EXCESSIVE_COVERAGE,
    /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
    POOR_QUALITY
}
|
||||
|
|
@@ -0,0 +1,172 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;

import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocComparator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;

import java.io.PrintStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

/**
 * Short one line description of the walker.
 *
 * <p>
 * [Long description of the walker]
 * </p>
 *
 *
 * <h2>Input</h2>
 * <p>
 * [Description of the Input]
 * </p>
 *
 * <h2>Output</h2>
 * <p>
 * [Description of the Output]
 * </p>
 *
 * <h2>Examples</h2>
 * <pre>
 *    java
 *      -jar GenomeAnalysisTK.jar
 *      -T [walker name]
 * </pre>
 *
 * @author Mauricio Carneiro
 * @since 2/1/12
 */
@By(value = DataSource.READS)
public class DiagnoseTargets extends LocusWalker<Long, Long> {
    @Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
    private IntervalBinding<Feature> intervalTrack = null;

    @Output
    private PrintStream out = System.out;

    @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false)
    private int expandInterval = 50;

    @Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false)
    private int minimumBaseQuality = 20;

    @Argument(fullName = "minimum_mapping_quality", shortName = "mmq", doc = "", required = false)
    private int minimumMappingQuality = 20;

    @Argument(fullName = "minimum_coverage", shortName = "mincov", doc = "", required = false)
    private int minimumCoverage = 5;

    @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
    private int maximumCoverage = 700;

    private TreeSet<GenomeLoc> intervalList = null;                    // The list of intervals of interest (plus expanded intervals if user wants them)
    private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics
    private Iterator<GenomeLoc> intervalListIterator;                  // An iterator to go over all the intervals provided as we traverse the genome
    private GenomeLoc currentInterval = null;                          // The "current" interval loaded and being filled with statistics
    private IntervalStatistics currentIntervalStatistics = null;       // The statistics object for the "current" interval

    private GenomeLocParser parser;                                    // just an object to allow us to create genome locs (for the expanded intervals)

    @Override
    public void initialize() {
        super.initialize();

        if (intervalTrack == null)
            throw new UserException("This tool currently only works if you provide an interval track");

        parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below

        List<GenomeLoc> originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided
        intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator());
        intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(originalList.size() * 2);
        for (GenomeLoc interval : originalList)
            addAndExpandIntervalToLists(interval);

        intervalListIterator = intervalList.iterator();
    }

    @Override
    public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        GenomeLoc refLocus = ref.getLocus();
        while (currentInterval == null || currentInterval.isBefore(refLocus)) {
            if (!intervalListIterator.hasNext())
                return 0L;

            currentInterval = intervalListIterator.next();
            currentIntervalStatistics = intervalMap.get(currentInterval);
        }

        if (currentInterval.isPast(refLocus))
            return 0L;

        byte[] mappingQualities = context.getBasePileup().getMappingQuals();
        byte[] baseQualities = context.getBasePileup().getQuals();
        int coverage = context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
        int rawCoverage = context.size();

        IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage);
        currentIntervalStatistics.addLocus(refLocus, locusData);

        return 1L;
    }

    @Override
    public Long reduceInit() {
        return 0L;
    }

    @Override
    public Long reduce(Long value, Long sum) {
        return sum + value;
    }

    @Override
    public void onTraversalDone(Long result) {
        super.onTraversalDone(result);
        out.println("Interval\tCallStatus\tCOV\tAVG");
        for (GenomeLoc interval : intervalList) {
            IntervalStatistics stats = intervalMap.get(interval);
            out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage()));
        }
    }

    private GenomeLoc createIntervalBefore(GenomeLoc interval) {
        int start = Math.max(interval.getStart() - expandInterval, 0);
        int stop = Math.max(interval.getStart() - 1, 0);
        return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop);
    }

    private GenomeLoc createIntervalAfter(GenomeLoc interval) {
        int contigLimit = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(interval.getContigIndex()).getSequenceLength();
        int start = Math.min(interval.getStop() + 1, contigLimit);
        int stop = Math.min(interval.getStop() + expandInterval, contigLimit);
        return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop);
    }

    private void addAndExpandIntervalToLists(GenomeLoc interval) {
        if (expandInterval > 0) {
            GenomeLoc before = createIntervalBefore(interval);
            GenomeLoc after = createIntervalAfter(interval);
            intervalList.add(before);
            intervalList.add(after);
            intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
            intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
        }
        intervalList.add(interval);
        intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
    }
}

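The flank expansion above is simple clamped arithmetic; the following standalone sketch (my own illustration, with hypothetical coordinates and contig length, not part of the patch) shows what createIntervalBefore/createIntervalAfter produce for a 20:1000-2000 target with the default expand_interval of 50.

// Hypothetical numbers only: a 20:1000-2000 target, expand_interval = 50, contig length 63,025,520,
// clamped exactly as createIntervalBefore/createIntervalAfter do above.
public class ExpandIntervalSketch {
    public static void main(String[] args) {
        final int expandInterval = 50;
        final int start = 1000, stop = 2000, contigLimit = 63025520;

        final int beforeStart = Math.max(start - expandInterval, 0);            // 950
        final int beforeStop  = Math.max(start - 1, 0);                         // 999
        final int afterStart  = Math.min(stop + 1, contigLimit);                // 2001
        final int afterStop   = Math.min(stop + expandInterval, contigLimit);   // 2050

        System.out.printf("before=%d-%d target=%d-%d after=%d-%d%n",
                beforeStart, beforeStop, start, stop, afterStart, afterStop);
        // The walker would then track three intervals, each with its own IntervalStatistics object.
    }
}
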
@@ -0,0 +1,34 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;

/**
 * The definition of a locus for the DiagnoseTargets walker statistics calculation
 *
 * @author Mauricio Carneiro
 * @since 2/3/12
 */
class IntervalStatisticLocus {
    private final byte[] mappingQuality;
    private final byte[] baseQuality;
    private final int coverage;
    private final int rawCoverage;

    public IntervalStatisticLocus(byte[] mappingQuality, byte[] baseQuality, int coverage, int rawCoverage) {
        this.mappingQuality = mappingQuality;
        this.baseQuality = baseQuality;
        this.coverage = coverage;
        this.rawCoverage = rawCoverage;
    }

    public IntervalStatisticLocus() {
        this(new byte[1], new byte[1], 0, 0);
    }

    public int getCoverage() {
        return coverage;
    }

    public int getRawCoverage() {
        return rawCoverage;
    }

}

@@ -0,0 +1,122 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;

import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

import java.util.ArrayList;
import java.util.HashMap;

/**
 * Keeps track of the coverage statistics of one interval for the DiagnoseTargets walker.
 *
 * @author Mauricio Carneiro
 * @since 2/1/12
 */
class IntervalStatistics {
    private final GenomeLoc interval;
    private final ArrayList<IntervalStatisticLocus> loci;

    private final int minimumCoverageThreshold;
    private final int maximumCoverageThreshold;
    private final int minimumMappingQuality;
    private final int minimumBaseQuality;

    private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)

    private IntervalStatistics(GenomeLoc interval, ArrayList<IntervalStatisticLocus> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
        this.interval = interval;
        this.loci = loci;
        this.minimumCoverageThreshold = minimumCoverageThreshold;
        this.maximumCoverageThreshold = maximumCoverageThreshold;
        this.minimumMappingQuality = minimumMappingQuality;
        this.minimumBaseQuality = minimumBaseQuality;
    }

    public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
        this(interval, new ArrayList<IntervalStatisticLocus>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality);

        // Initialize every locus (this way we don't have to worry about non-existent loci in the object)
        for (int i = 0; i < interval.size(); i++)
            this.loci.add(i, new IntervalStatisticLocus());

    }

    public long totalCoverage() {
        if (preComputedTotalCoverage < 0)
            calculateTotalCoverage();
        return preComputedTotalCoverage;
    }

    public double averageCoverage() {
        if (preComputedTotalCoverage < 0)
            calculateTotalCoverage();
        return (double) preComputedTotalCoverage / loci.size();
    }

    /**
     * Calculates the callable status of the entire interval
     *
     * @return the callable status of the entire interval
     */
    public CallableStatus callableStatus() {
        long max = -1;
        CallableStatus maxCallableStatus = null;
        HashMap<CallableStatus, Integer> statusCounts = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);

        // initialize the statusCounts with all callable states
        for (CallableStatus key : CallableStatus.values())
            statusCounts.put(key, 0);

        // calculate the callable status for each locus
        for (int i = 0; i < loci.size(); i++) {
            CallableStatus status = callableStatus(i);
            int count = statusCounts.get(status) + 1;
            statusCounts.put(status, count);

            if (count > max) {
                max = count;
                maxCallableStatus = status;
            }
        }

        return maxCallableStatus;
    }

    public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) {
        if (!interval.containsP(locus))
            throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));

        int locusIndex = locus.getStart() - interval.getStart();

        loci.add(locusIndex, locusData);
    }

    /**
     * returns the callable status of this locus without taking the reference base into account.
     *
     * @param locusIndex location in the genome to inquire (only one locus)
     * @return the callable status of a locus
     */
    private CallableStatus callableStatus(int locusIndex) {
        if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold)
            return CallableStatus.EXCESSIVE_COVERAGE;

        if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold)
            return CallableStatus.CALLABLE;

        if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold)
            return CallableStatus.POOR_QUALITY;

        if (loci.get(locusIndex).getRawCoverage() > 0)
            return CallableStatus.LOW_COVERAGE;

        return CallableStatus.NO_COVERAGE;
    }

    private void calculateTotalCoverage() {
        preComputedTotalCoverage = 0;
        for (IntervalStatisticLocus locus : loci)
            preComputedTotalCoverage += locus.getCoverage();
    }

}

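The interval-level status above is a majority vote over the per-locus statuses; this standalone sketch (illustrative only, with made-up counts and plain strings instead of the CallableStatus enum) mirrors the selection loop in callableStatus().

import java.util.HashMap;
import java.util.Map;

// Hypothetical per-locus status counts for one interval; the winner is the most frequent status,
// exactly as in IntervalStatistics.callableStatus() above.
public class MajorityStatusSketch {
    public static void main(String[] args) {
        final Map<String, Integer> statusCounts = new HashMap<String, Integer>();
        statusCounts.put("CALLABLE", 940);
        statusCounts.put("LOW_COVERAGE", 55);
        statusCounts.put("NO_COVERAGE", 5);

        long max = -1;
        String winner = null;
        for (Map.Entry<String, Integer> e : statusCounts.entrySet()) {
            if (e.getValue() > max) {
                max = e.getValue();
                winner = e.getKey();
            }
        }
        System.out.println(winner); // CALLABLE
    }
}
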
@@ -139,6 +139,12 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
    @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
    protected Boolean FAIL_MISSING_VALUES = false;

    /**
     * Invalidate previous filters applied to the VariantContext, applying only the filters here
     */
    @Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false)
    boolean invalidatePrevious = false;

    // JEXL expressions for the filters
    List<VariantContextUtils.JexlVCMatchExp> filterExps;
    List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;

@@ -215,6 +221,9 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {

        for ( VariantContext vc : VCs ) {

            if ( invalidatePrevious ) {
                vc = (new VariantContextBuilder(vc)).filters(new HashSet<String>()).make();
            }
            // filter based on previous mask position
            if ( previousMaskPosition != null &&                          // we saw a previous mask site
                 previousMaskPosition.getContig().equals(vc.getChr()) &&  // it's on the same contig

@@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;

import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;

import java.io.PrintStream;
import java.util.List;

@@ -41,10 +41,11 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {

    public enum Model {
        /** The default model with the best performance in all cases */
        EXACT,
        EXACT
    }

    protected int N;
    protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;

    protected Logger logger;
    protected PrintStream verboseWriter;

@@ -53,20 +54,21 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {

    protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;

    protected AlleleFrequencyCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
    protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) {
        this.N = N;
        this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES;
        this.logger = logger;
        this.verboseWriter = verboseWriter;
    }

    /**
     * Must be overridden by concrete subclasses
     * @param GLs genotype likelihoods
     * @param Alleles Alleles corresponding to GLs
     * @param vc variant context with alleles and genotype likelihoods
     * @param log10AlleleFrequencyPriors priors
     * @param result (pre-allocated) object to store likelihoods results
     * @return the alleles used for genotyping
     */
    protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
                                            double[][] log10AlleleFrequencyPriors,
                                            AlleleFrequencyCalculationResult result);
    protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
                                                    final double[][] log10AlleleFrequencyPriors,
                                                    final AlleleFrequencyCalculationResult result);
}

@@ -275,19 +275,22 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
    public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        byte obsBase = elt.getBase();
        byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
        if ( qual == 0 )
            return 0;

        if ( elt.isReducedRead() ) {
        if ( elt.getRead().isReducedRead() ) {
            // reduced read representation
            if ( BaseUtils.isRegularBase( obsBase )) {
                add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods
                return elt.getRepresentativeCount(); // we added nObs bases here
                int representativeCount = elt.getRepresentativeCount();
                add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods
                return representativeCount; // we added nObs bases here
            }

            // odd bases or deletions => don't use them
            return 0;
        }

        return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
        return add(obsBase, qual, (byte)0, (byte)0, 1);
    }

    public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {

@@ -519,7 +522,7 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
        if ( qual > SAMUtils.MAX_PHRED_SCORE )
            throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
        if ( capBaseQualsAtMappingQual )
            qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
            qual = (byte)Math.min((int)qual, p.getMappingQual());
        if ( (int)qual < minBaseQual )
            qual = (byte)0;

@@ -35,25 +35,89 @@ import java.util.*;

public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {

    private final static boolean DEBUG = false;
//    private final static boolean DEBUG = false;

    private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6


    protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
        super(UAC, N, logger, verboseWriter);
    }

    public void getLog10PNonRef(final GenotypesContext GLs,
                                final List<Allele> alleles,
                                final double[][] log10AlleleFrequencyPriors,
                                final AlleleFrequencyCalculationResult result) {
        final int numAlleles = alleles.size();
    public List<Allele> getLog10PNonRef(final VariantContext vc,
                                        final double[][] log10AlleleFrequencyPriors,
                                        final AlleleFrequencyCalculationResult result) {

        GenotypesContext GLs = vc.getGenotypes();
        List<Allele> alleles = vc.getAlleles();

        // don't try to genotype too many alternate alleles
        if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
            logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");

            alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
            alleles.add(vc.getReference());
            alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE));
            GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false);
        }

        //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
        linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false);
        linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false);

        return alleles;
    }

    private static final class LikelihoodSum implements Comparable<LikelihoodSum> {
        public double sum = 0.0;
        public Allele allele;

        public LikelihoodSum(Allele allele) { this.allele = allele; }

        public int compareTo(LikelihoodSum other) {
            final double diff = sum - other.sum;
            return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
        }
    }

    private static final int PL_INDEX_OF_HOM_REF = 0;
    private static final List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
        final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
        final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
        for ( int i = 0; i < numOriginalAltAlleles; i++ )
            likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));

        // make sure that we've cached enough data
        if ( numOriginalAltAlleles > UnifiedGenotyperEngine.PLIndexToAlleleIndex.length - 1 )
            UnifiedGenotyperEngine.calculatePLcache(numOriginalAltAlleles);

        // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
        final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
        for ( final double[] likelihoods : GLs ) {
            final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
            if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
                int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numOriginalAltAlleles][PLindexOfBestGL];
                if ( alleles[0] != 0 )
                    likelihoodSums[alleles[0]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
                // don't double-count it
                if ( alleles[1] != 0 && alleles[1] != alleles[0] )
                    likelihoodSums[alleles[1]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
            }
        }

        // sort them by probability mass and choose the best ones
        Collections.sort(Arrays.asList(likelihoodSums));
        final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
        for ( int i = 0; i < numAllelesToChoose; i++ )
            bestAlleles.add(likelihoodSums[i].allele);

        final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
        for ( Allele allele : vc.getAlternateAlleles() ) {
            if ( bestAlleles.contains(allele) )
                orderedBestAlleles.add(allele);
        }

        return orderedBestAlleles;
    }

    private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
        ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size());

@@ -70,47 +134,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
        return genotypeLikelihoods;
    }


    final static double approximateLog10SumLog10(double[] vals) {
        if ( vals.length < 2 )
            throw new ReviewedStingException("Passing array with fewer than 2 values when computing approximateLog10SumLog10");

        double approx = approximateLog10SumLog10(vals[0], vals[1]);
        for ( int i = 2; i < vals.length; i++ )
            approx = approximateLog10SumLog10(approx, vals[i]);
        return approx;
    }

    final static double approximateLog10SumLog10(double small, double big) {
        // make sure small is really the smaller value
        if ( small > big ) {
            final double t = big;
            big = small;
            small = t;
        }

        if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
            return big;

        if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE)
            return big;

        // OK, so |y-x| < tol: we use the following identity then:
        // we need to compute log10(10^x + 10^y)
        // By Jacobian logarithm identity, this is equal to
        // max(x,y) + log10(1+10^-abs(x-y))
        // we compute the second term as a table lookup
        // with integer quantization
        // we have pre-stored correction for 0,0.1,0.2,... 10.0
        //final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
        int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding

        //double z =Math.log10(1+Math.pow(10.0,-diff));
        //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
        return big + MathUtils.jacobianLogTable[ind];
    }
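The helper above rests on the Jacobian logarithm identity log10(10^x + 10^y) = max(x,y) + log10(1 + 10^-|x-y|), with the correction term read from a quantized table. The sketch below is my own check of that identity (not part of the patch), using hypothetical log10 values and computing the correction exactly instead of via the lookup table.

// Standalone check of the identity used by approximateLog10SumLog10:
// log10(10^x + 10^y) == max(x,y) + log10(1 + 10^-(max-min)).
public class Log10SumSketch {
    public static void main(String[] args) {
        final double big = -3.0, small = -4.2; // hypothetical log10 likelihoods

        final double exact = Math.log10(Math.pow(10.0, big) + Math.pow(10.0, small));
        final double viaIdentity = big + Math.log10(1.0 + Math.pow(10.0, small - big));

        System.out.printf("exact=%.6f identity=%.6f%n", exact, viaIdentity); // identical up to rounding
    }
}
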

    // -------------------------------------------------------------------------------------
    //
    // Multi-allelic implementation.

@@ -207,7 +230,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
        final int numChr = 2*numSamples;

        // queue of AC conformations to process
        final Queue<ExactACset> ACqueue = new LinkedList<ExactACset>();
        final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();

        // mapping of ExactACset indexes to the objects
        final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);

@@ -218,40 +241,57 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
        ACqueue.add(zeroSet);
        indexesToACset.put(zeroSet.ACcounts, zeroSet);

        // optimization: create the temporary storage for computing L(j,k) just once
        final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
        final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
        for ( int i = 0; i < maxPossibleDependencies; i++ )
            tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;

        // keep processing while we have AC conformations that need to be calculated
        double maxLog10L = Double.NEGATIVE_INFINITY;
        while ( !ACqueue.isEmpty() ) {
            // compute log10Likelihoods
            final ExactACset set = ACqueue.remove();
            final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
            final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);

            // adjust max likelihood seen if needed
            maxLog10L = Math.max(maxLog10L, log10LofKs);
        }
    }

    private static final class DependentSet {
        public final int[] ACcounts;
        public final int PLindex;

        public DependentSet(final int[] ACcounts, final int PLindex) {
            this.ACcounts = ACcounts;
            this.PLindex = PLindex;
        }
    }

    private static double calculateAlleleCountConformation(final ExactACset set,
                                                           final ArrayList<double[]> genotypeLikelihoods,
                                                           final double maxLog10L,
                                                           final int numChr,
                                                           final boolean preserveData,
                                                           final Queue<ExactACset> ACqueue,
                                                           final LinkedList<ExactACset> ACqueue,
                                                           final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                                           final double[][] log10AlleleFrequencyPriors,
                                                           final AlleleFrequencyCalculationResult result) {
                                                           final AlleleFrequencyCalculationResult result,
                                                           final double[][] tempLog10ConformationLikelihoods) {

        if ( DEBUG )
            System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
        //if ( DEBUG )
        //    System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);

        // compute the log10Likelihoods
        computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result);
        computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);

        // clean up memory
        if ( !preserveData ) {
            for ( ExactACcounts index : set.dependentACsetsToDelete ) {
                indexesToACset.put(index, null);
                if ( DEBUG )
                    System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
                indexesToACset.remove(index);
                //if ( DEBUG )
                //    System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
            }
        }

@@ -259,12 +299,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {

        // can we abort early because the log10Likelihoods are so small?
        if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
            if ( DEBUG )
                System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
            //if ( DEBUG )
            //    System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);

            // no reason to keep this data around because nothing depends on it
            if ( !preserveData )
                indexesToACset.put(set.ACcounts, null);
                indexesToACset.remove(set.ACcounts);

            return log10LofK;
        }

@@ -274,7 +314,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
        if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
            return log10LofK;

        ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing
        final int numAltAlleles = set.ACcounts.getCounts().length;

        // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.

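For reference, the standard VCF ordering of diploid genotype likelihoods places genotype j/k (with j <= k, 0 = reference) at index k*(k+1)/2 + j. The sketch below is illustrative only -- whether the internal PL cache here enumerates the triangle in exactly this order is an assumption -- but it shows how a linear PL vector encodes the triangular genotype matrix the comment above refers to.

// Illustrative only: the standard VCF ordering of diploid genotype likelihoods.
public class PLIndexSketch {
    public static void main(String[] args) {
        final int numAlleles = 3; // e.g. ref + 2 alternates (hypothetical)
        for (int k = 0; k < numAlleles; k++) {
            for (int j = 0; j <= k; j++) {
                System.out.printf("genotype %d/%d -> PL index %d%n", j, k, k * (k + 1) / 2 + j);
            }
        }
        // prints 0/0->0, 0/1->1, 1/1->2, 0/2->3, 1/2->4, 2/2->5
    }
}
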
@@ -285,30 +324,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
        for ( int allele = 0; allele < numAltAlleles; allele++ ) {
            final int[] ACcountsClone = set.ACcounts.getCounts().clone();
            ACcountsClone[allele]++;
            lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
            updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
        }

        // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
        if ( ACwiggle > 1 ) {
            final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
            final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);

            for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
                for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
                    final int[] ACcountsClone = set.ACcounts.getCounts().clone();
                    ACcountsClone[allele_i]++;
                    ACcountsClone[allele_j]++;
                    lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset);

                    if ( allele_i == allele_j )
                        sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
                    else
                        differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
                }
            }

            // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
            for ( DependentSet dependent : differentAlleles )
                updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
            for ( DependentSet dependent : sameAlleles )
                updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
        }

        // if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate
        // over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early)
        if ( !preserveData && lastSet == null ) {
            if ( DEBUG )
                System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts);
            lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
        // determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column
        if ( !preserveData ) {
            final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
            if ( lastSet != null )
                lastSet.dependentACsetsToDelete.add(set.ACcounts);
        }
        if ( lastSet != null )
            lastSet.dependentACsetsToDelete.add(set.ACcounts);

        return log10LofK;
    }

@@ -316,41 +365,44 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
    // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
    // also adds it as a dependency to the given callingSetIndex.
    // returns the ExactACset if that set was not already in the queue and null otherwise.
    private static ExactACset updateACset(final int[] ACcounts,
                                          final int numChr,
                                          final ExactACset callingSet,
                                          final int PLsetIndex,
                                          final Queue<ExactACset> ACqueue,
                                          final HashMap<ExactACcounts, ExactACset> indexesToACset) {
    private static void updateACset(final int[] ACcounts,
                                    final int numChr,
                                    final ExactACset callingSet,
                                    final int PLsetIndex,
                                    final Queue<ExactACset> ACqueue,
                                    final HashMap<ExactACcounts, ExactACset> indexesToACset) {
        final ExactACcounts index = new ExactACcounts(ACcounts);
        boolean wasInQueue = true;
        if ( !indexesToACset.containsKey(index) ) {
            ExactACset set = new ExactACset(numChr/2 +1, index);
            indexesToACset.put(index, set);
            ACqueue.add(set);
            wasInQueue = false;
        }

        // add the given dependency to the set
        //if ( DEBUG )
        //    System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts);
        final ExactACset set = indexesToACset.get(index);
        set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
        return wasInQueue ? null : set;
    }

    private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue<ExactACset> ACqueue) {
        ExactACset set = null;
        for ( ExactACset queued : ACqueue ) {
            if ( queued.dependentACsetsToDelete.contains(callingSetIndex) )
                set = queued;
    private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList<ExactACset> ACqueue) {
        Iterator<ExactACset> reverseIterator = ACqueue.descendingIterator();
        while ( reverseIterator.hasNext() ) {
            final ExactACset queued = reverseIterator.next();
            if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) )
                return queued;
        }
        return set;

        // shouldn't get here
        throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!");
    }

    private static void computeLofK(final ExactACset set,
                                    final ArrayList<double[]> genotypeLikelihoods,
                                    final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                    final double[][] log10AlleleFrequencyPriors,
                                    final AlleleFrequencyCalculationResult result) {
                                    final AlleleFrequencyCalculationResult result,
                                    final double[][] tempLog10ConformationLikelihoods) {

        set.log10Likelihoods[0] = 0.0; // the zero case
        final int totalK = set.getACsum();

@@ -362,38 +414,41 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
        }
        // k > 0 for at least one k
        else {
            // all possible likelihoods for a given cell from which to choose the max
            final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
            final double[] log10ConformationLikelihoods = new double[numPaths]; // TODO can be created just once, since you initialize it
            // deal with the non-AA possible conformations
            int conformationIndex = 1;
            for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
                //if ( DEBUG )
                //    System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());

            for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
                final double[] gl = genotypeLikelihoods.get(j);
                final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
                ExactACset dependent = indexesToACset.get(mapping.getKey());

                // initialize
                for ( int i = 0; i < numPaths; i++ )
                    // TODO -- Arrays.fill?
                    // todo -- is this even necessary? Why not have as else below?
                    log10ConformationLikelihoods[i] = Double.NEGATIVE_INFINITY;
                for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {

                // deal with the AA case first
                if ( totalK < 2*j-1 )
                    log10ConformationLikelihoods[0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];

                // deal with the other possible conformations now
                if ( totalK <= 2*j ) { // skip impossible conformations
                    int conformationIndex = 1;
                    for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
                        if ( DEBUG )
                            System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
                        log10ConformationLikelihoods[conformationIndex++] =
                            determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()];
                    if ( totalK <= 2*j ) { // skip impossible conformations
                        final double[] gl = genotypeLikelihoods.get(j);
                        tempLog10ConformationLikelihoods[j][conformationIndex] =
                            determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
                    } else {
                        tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
                    }
                }

                final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods);
                conformationIndex++;
            }

            // finally, update the L(j,k) value
            // finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
            final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
            for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {

                if ( totalK < 2*j-1 ) {
                    final double[] gl = genotypeLikelihoods.get(j);
                    tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
                } else {
                    tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
                }

                final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
                final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths);
                set.log10Likelihoods[j] = log10Max - logDenominator;
            }
        }

@@ -415,10 +470,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
        // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
        for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
            int AC = set.ACcounts.getCounts()[i];
            result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
            result.log10AlleleFrequencyLikelihoods[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);

            final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
            result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
            result.log10AlleleFrequencyPosteriors[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
        }
    }
}

@@ -564,7 +619,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
        lastK = k;
        maxLog10L = Math.max(maxLog10L, log10LofK);
        if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
            if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
            //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
            done = true;
        }

@@ -31,12 +31,14 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;

import java.util.List;
import java.util.Map;


@@ -72,25 +74,28 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
        this.logger = logger;
    }

    /**
     * Must be overridden by concrete subclasses
     *
     * @param tracker rod data
     * @param ref reference context
     * @param contexts stratified alignment contexts
     * @param contextType stratified context type
     * @param priors priors to use for GLs
     * @param alternateAlleleToUse the alternate allele to use, null if not set
     * @param useBAQedPileup should we use the BAQed pileup or the raw one?
     * @return variant context where genotypes are no-called but with GLs
     */
    public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
                                                  ReferenceContext ref,
                                                  Map<String, AlignmentContext> contexts,
                                                  AlignmentContextUtils.ReadOrientation contextType,
                                                  GenotypePriors priors,
                                                  Allele alternateAlleleToUse,
                                                  boolean useBAQedPileup);
    /**
     * Can be overridden by concrete subclasses
     *
     * @param tracker rod data
     * @param ref reference context
     * @param contexts stratified alignment contexts
     * @param contextType stratified context type
     * @param priors priors to use for GLs
     * @param alternateAllelesToUse the alternate alleles to use, null if not set
     * @param useBAQedPileup should we use the BAQed pileup or the raw one?
     * @param locParser Genome Loc Parser
     * @return variant context where genotypes are no-called but with GLs
     */
    public abstract VariantContext getLikelihoods(final RefMetaDataTracker tracker,
                                                  final ReferenceContext ref,
                                                  final Map<String, AlignmentContext> contexts,
                                                  final AlignmentContextUtils.ReadOrientation contextType,
                                                  final GenotypePriors priors,
                                                  final List<Allele> alternateAllelesToUse,
                                                  final boolean useBAQedPileup,
                                                  final GenomeLocParser locParser);


    protected int getFilteredDepth(ReadBackedPileup pileup) {
        int count = 0;

@@ -33,9 +33,11 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;

@@ -54,19 +56,18 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
    private final boolean getAlleleListFromVCF;

    private boolean DEBUG = false;

    private final boolean doMultiAllelicCalls = true;
    private boolean ignoreSNPAllelesWhenGenotypingIndels = false;

    private PairHMMIndelErrorModel pairModel;

    private static ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>> indelLikelihoodMap =
        new ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>>() {
            protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() {
                return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
            }
        };
    private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap =
        new ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>>() {
            protected synchronized HashMap<PileupElement, LinkedHashMap<Allele, Double>> initialValue() {
                return new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
            }
        };

    private LinkedHashMap<Allele,Haplotype> haplotypeMap;
    private LinkedHashMap<Allele, Haplotype> haplotypeMap;

    // gdebug removeme
    // todo -cleanup

@@ -74,37 +75,37 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
    private ArrayList<Allele> alleleList;

    static {
        indelLikelihoodMap.set(new HashMap<PileupElement,LinkedHashMap<Allele,Double>>());
        indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
    }


    protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);
        pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
                UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.BANDED_INDEL_COMPUTATION);
        pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
                UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
        alleleList = new ArrayList<Allele>();
        getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
        minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
        HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
        DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;

        haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
        haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
        ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
    }


    private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
                                                      Map<String, AlignmentContext> contexts,
                                                      AlignmentContextUtils.ReadOrientation contextType) {
        Allele refAllele=null, altAllele=null;
                                                      AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) {
        Allele refAllele = null, altAllele = null;
        GenomeLoc loc = ref.getLocus();
        ArrayList<Allele> aList = new ArrayList<Allele>();

        HashMap<String,Integer> consensusIndelStrings = new HashMap<String,Integer>();
        HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();

        int insCount = 0, delCount = 0;
        // quick check of total number of indels in pileup
        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
        for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
            AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

            final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();

@@ -114,22 +115,20 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood

        if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
            return aList;

        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {

        for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
            // todo -- warning, can be duplicating expensive partition here
            AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

            final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();




            for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) {
            for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
                //SAMRecord read = p.getRead();
                GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
                GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
                if (read == null)
                    continue;
                if(ReadUtils.is454Read(read)) {
                    continue;
                if (ReadUtils.is454Read(read)) {
                    continue;
                }

@@ -143,60 +142,69 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
                String indelString = p.getEventBases();
                if (p.isInsertion()) {
                    boolean foundKey = false;
                    // copy of hashmap into temp arrayList
                    ArrayList<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
                    for (String s : consensusIndelStrings.keySet()) {
                        cList.add(new Pair<String, Integer>(s,consensusIndelStrings.get(s)));
                    }

                    if (read.getAlignmentEnd() == loc.getStart()) {
                        // first corner condition: a read has an insertion at the end, and we're right at the insertion.
                        // In this case, the read could have any of the inserted bases and we need to build a consensus
                        for (String s : consensusIndelStrings.keySet()) {
                            int cnt = consensusIndelStrings.get(s);
                            if (s.startsWith(indelString)){
                                // case 1: current insertion is prefix of indel in hash map
                                consensusIndelStrings.put(s,cnt+1);

                        for (int k=0; k < cList.size(); k++) {
                            String s = cList.get(k).getFirst();
                            int cnt = cList.get(k).getSecond();
                            // case 1: current insertion is prefix of indel in hash map
                            if (s.startsWith(indelString)) {
                                cList.set(k,new Pair<String, Integer>(s,cnt+1));
                                foundKey = true;
                                break;
                            }
                            else if (indelString.startsWith(s)) {
                                // case 2: indel stored in hash table is prefix of current insertion
                                // In this case, new bases are new key.
                                consensusIndelStrings.remove(s);
                                consensusIndelStrings.put(indelString,cnt+1);
                                foundKey = true;
                                break;
                                cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
                            }
                        }
                        if (!foundKey)
                            // none of the above: event bases not supported by previous table, so add new key
                            consensusIndelStrings.put(indelString,1);
                            cList.add(new Pair<String, Integer>(indelString,1));

                    }
                    else if (read.getAlignmentStart() == loc.getStart()+1) {
                        // opposite corner condition: read will start at current locus with an insertion
                        for (String s : consensusIndelStrings.keySet()) {
                            int cnt = consensusIndelStrings.get(s);
                            if (s.endsWith(indelString)){
                                // case 1: current insertion is suffix of indel in hash map
                                consensusIndelStrings.put(s,cnt+1);
                        for (int k=0; k < cList.size(); k++) {
                            String s = cList.get(k).getFirst();
                            int cnt = cList.get(k).getSecond();
                            if (s.endsWith(indelString)) {
                                // case 1: current insertion (indelString) is suffix of indel in hash map (s)
                                cList.set(k,new Pair<String, Integer>(s,cnt+1));
                                foundKey = true;
                                break;
                            }
                            else if (indelString.endsWith(s)) {
                                // case 2: indel stored in hash table is suffix of current insertion
                                // case 2: indel stored in hash table is prefix of current insertion
                                // In this case, new bases are new key.

                                consensusIndelStrings.remove(s);
                                consensusIndelStrings.put(indelString,cnt+1);
                                foundKey = true;
                                break;
                                cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
                            }
                        }
                        if (!foundKey)
                            // none of the above: event bases not supported by previous table, so add new key
                            consensusIndelStrings.put(indelString,1);
                            cList.add(new Pair<String, Integer>(indelString,1));


                    }
                    else {
                        // normal case: insertion somewhere in the middle of a read: add count to hash map
                        // normal case: insertion somewhere in the middle of a read: add count to arrayList
                        int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
                        consensusIndelStrings.put(indelString,cnt+1);
                        cList.add(new Pair<String, Integer>(indelString,cnt+1));
                    }

                    // copy back arrayList into hashMap
                    consensusIndelStrings.clear();
                    for (Pair<String,Integer> pair : cList) {
                        consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
                    }

                }

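A concrete pass through the prefix-matching branch may help: the sketch below (my own illustration with hypothetical event strings, using plain string/count arrays instead of the GATK Pair class) shows a truncated observation "CA" being credited to an existing candidate "CAG", which is what case 1 above does for reads that end exactly at the locus; case 2 would instead replace the stored key when the new observation extends it.

import java.util.ArrayList;
import java.util.List;

// Hypothetical illustration of the consensus-merging rule above.
public class ConsensusMergeSketch {
    public static void main(String[] args) {
        List<String[]> candidates = new ArrayList<String[]>(); // each entry is {indelString, count}
        candidates.add(new String[]{"CAG", "3"});

        String observed = "CA"; // truncated observation from a read ending at the locus
        boolean foundKey = false;
        for (int k = 0; k < candidates.size(); k++) {
            String s = candidates.get(k)[0];
            int cnt = Integer.parseInt(candidates.get(k)[1]);
            if (s.startsWith(observed)) {            // case 1: observation is a prefix of a known event
                candidates.set(k, new String[]{s, String.valueOf(cnt + 1)});
                foundKey = true;
                break;
            } else if (observed.startsWith(s)) {     // case 2: known event is a prefix of the observation
                candidates.set(k, new String[]{observed, String.valueOf(cnt + 1)});
                foundKey = true;
                break;
            }
        }
        if (!foundKey)
            candidates.add(new String[]{observed, "1"});

        System.out.println(candidates.get(0)[0] + " x" + candidates.get(0)[1]); // CAG x4
    }
}
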
@ -208,78 +216,84 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
}
|
||||
}
|
||||
|
||||
/* if (DEBUG) {
|
||||
int icount = indelPileup.getNumberOfInsertions();
|
||||
int dcount = indelPileup.getNumberOfDeletions();
|
||||
if (icount + dcount > 0)
|
||||
{
|
||||
List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
|
||||
System.out.format("#ins: %d, #del:%d\n", insCount, delCount);
|
||||
|
||||
for (int i=0 ; i < eventStrings.size() ; i++ ) {
|
||||
System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second);
|
||||
// int k=0;
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
} */
|
||||
}
|
||||
|
||||
Collection<VariantContext> vcs = new ArrayList<VariantContext>();
|
||||
int maxAlleleCnt = 0;
|
||||
String bestAltAllele = "";
|
||||
|
||||
for (String s : consensusIndelStrings.keySet()) {
|
||||
int curCnt = consensusIndelStrings.get(s);
|
||||
if (curCnt > maxAlleleCnt) {
|
||||
maxAlleleCnt = curCnt;
|
||||
bestAltAllele = s;
|
||||
int curCnt = consensusIndelStrings.get(s), stop = 0;
|
||||
// if observed count is above minimum threshold, we will genotype this allele
|
||||
if (curCnt < minIndelCountForGenotyping)
|
||||
continue;
|
||||
|
||||
if (s.startsWith("D")) {
|
||||
// get deletion length
|
||||
int dLen = Integer.valueOf(s.substring(1));
|
||||
// get ref bases of accurate deletion
|
||||
int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
|
||||
stop = loc.getStart() + dLen;
|
||||
byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);
|
||||
|
||||
if (Allele.acceptableAlleleBases(refBases)) {
|
||||
refAllele = Allele.create(refBases, true);
|
||||
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
}
|
||||
} else {
|
||||
// insertion case
|
||||
if (Allele.acceptableAlleleBases(s)) {
|
||||
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
altAllele = Allele.create(s, false);
|
||||
stop = loc.getStart();
|
||||
}
|
||||
}
|
||||
// if (DEBUG)
|
||||
// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
|
||||
} //gdebug-
|
||||
|
||||
if (maxAlleleCnt < minIndelCountForGenotyping)
|
||||
return aList;
|
||||
|
||||
if (bestAltAllele.startsWith("D")) {
|
||||
// get deletion length
|
||||
int dLen = Integer.valueOf(bestAltAllele.substring(1));
|
||||
// get ref bases of accurate deletion
|
||||
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
|
||||
ArrayList vcAlleles = new ArrayList<Allele>();
|
||||
vcAlleles.add(refAllele);
|
||||
vcAlleles.add(altAllele);
|
||||
|
||||
//System.out.println(new String(ref.getBases()));
|
||||
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
|
||||
final VariantContextBuilder builder = new VariantContextBuilder().source("");
|
||||
builder.loc(loc.getContig(), loc.getStart(), stop);
|
||||
builder.alleles(vcAlleles);
|
||||
builder.referenceBaseForIndel(ref.getBase());
|
||||
builder.noGenotypes();
|
||||
if (doMultiAllelicCalls)
|
||||
vcs.add(builder.make());
|
||||
else {
|
||||
if (curCnt > maxAlleleCnt) {
|
||||
maxAlleleCnt = curCnt;
|
||||
vcs.clear();
|
||||
vcs.add(builder.make());
|
||||
}
|
||||
|
||||
if (Allele.acceptableAlleleBases(refBases)) {
|
||||
refAllele = Allele.create(refBases,true);
|
||||
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// insertion case
|
||||
if (Allele.acceptableAlleleBases(bestAltAllele)) {
|
||||
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
altAllele = Allele.create(bestAltAllele, false);
|
||||
}
|
||||
}
|
||||
if (refAllele != null && altAllele != null) {
|
||||
aList.add(0,refAllele);
|
||||
aList.add(1,altAllele);
|
||||
}
|
||||
|
||||
if (vcs.isEmpty())
|
||||
return aList; // nothing else to do, no alleles passed minimum count criterion
|
||||
|
||||
VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
|
||||
|
||||
aList = new ArrayList<Allele>(mergedVC.getAlleles());
|
||||
|
||||
return aList;
|
||||
|
||||
}
|
||||
|
||||
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
|
||||
|
||||
public VariantContext getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup) {
|
||||
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final Map<String, AlignmentContext> contexts,
|
||||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenotypePriors priors,
|
||||
final List<Allele> alternateAllelesToUse,
|
||||
final boolean useBAQedPileup,
|
||||
final GenomeLocParser locParser) {
|
||||
|
||||
if ( tracker == null )
|
||||
if (tracker == null)
|
||||
return null;
|
||||
|
||||
GenomeLoc loc = ref.getLocus();
|
||||
|
|
@ -290,21 +304,21 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
// starting a new site: clear allele list
|
||||
alleleList.clear();
|
||||
lastSiteVisited = ref.getLocus();
|
||||
indelLikelihoodMap.set(new HashMap<PileupElement,LinkedHashMap<Allele,Double>>());
|
||||
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
|
||||
haplotypeMap.clear();
|
||||
|
||||
if (getAlleleListFromVCF) {
|
||||
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
|
||||
if( vc_input != null &&
|
||||
allowableTypes.contains(vc_input.getType()) &&
|
||||
ref.getLocus().getStart() == vc_input.getStart()) {
|
||||
vc = vc_input;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return null;
|
||||
for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) {
|
||||
if (vc_input != null &&
|
||||
allowableTypes.contains(vc_input.getType()) &&
|
||||
ref.getLocus().getStart() == vc_input.getStart()) {
|
||||
vc = vc_input;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// ignore places where we don't have a variant
|
||||
if (vc == null)
|
||||
return null;
|
||||
|
||||
alleleList.clear();
|
||||
if (ignoreSNPAllelesWhenGenotypingIndels) {
|
||||
|
|
@ -315,15 +329,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
else
|
||||
alleleList.add(a);
|
||||
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
for (Allele a : vc.getAlleles())
|
||||
alleleList.add(a);
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
alleleList = computeConsensusAlleles(ref,contexts, contextType);
|
||||
} else {
|
||||
alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser);
|
||||
if (alleleList.isEmpty())
|
||||
return null;
|
||||
}
|
||||
|
|
@ -333,21 +345,21 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
return null;
|
||||
|
||||
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
|
||||
if (ref.getWindow().getStop() < loc.getStop()+HAPLOTYPE_SIZE)
|
||||
if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
|
||||
return null;
|
||||
if ( !(priors instanceof DiploidIndelGenotypePriors) )
|
||||
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
|
||||
if (!(priors instanceof DiploidIndelGenotypePriors))
|
||||
throw new StingException("Only diploid-based Indel priors are supported in the INDEL GL model");
|
||||
|
||||
if (alleleList.isEmpty())
|
||||
return null;
|
||||
|
||||
|
||||
refAllele = alleleList.get(0);
|
||||
altAllele = alleleList.get(1);
|
||||
|
||||
// look for alt allele that has biggest length distance to ref allele
|
||||
int maxLenDiff = 0;
|
||||
for (Allele a: alleleList) {
|
||||
if(a.isNonReference()) {
|
||||
for (Allele a : alleleList) {
|
||||
if (a.isNonReference()) {
|
||||
int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
|
||||
if (lenDiff > maxLenDiff) {
|
||||
maxLenDiff = lenDiff;
|
||||
|
|
@ -357,11 +369,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
}
|
||||
|
||||
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
|
||||
final int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
|
||||
final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
|
||||
final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1;
|
||||
final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
|
||||
|
||||
if (hsize <=0) {
|
||||
logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping",loc.toString()));
|
||||
if (hsize <= 0) {
|
||||
logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString()));
|
||||
return null;
|
||||
}
|
||||
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
|
||||
|
|
@ -379,7 +391,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
// For each sample, get genotype likelihoods based on pileup
|
||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
|
||||
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
|
||||
|
||||
ReadBackedPileup pileup = null;
|
||||
|
|
@ -388,8 +400,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
else if (context.hasBasePileup())
|
||||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null ) {
|
||||
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
|
||||
if (pileup != null) {
|
||||
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
|
||||
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
|
@ -398,9 +410,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString());
|
||||
for (int k=0; k < genotypeLikelihoods.length; k++)
|
||||
System.out.format("%1.4f ",genotypeLikelihoods[k]);
|
||||
System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
|
||||
for (int k = 0; k < genotypeLikelihoods.length; k++)
|
||||
System.out.format("%1.4f ", genotypeLikelihoods[k]);
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
|
|
@ -412,21 +424,21 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
||||
// for indels, stop location is one more than ref allele length
|
||||
boolean hasNullAltAllele = false;
|
||||
for ( Allele a : alleles ) {
|
||||
if ( a.isNull() ) {
|
||||
for (Allele a : alleles) {
|
||||
if (a.isNull()) {
|
||||
hasNullAltAllele = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int endLoc = loc.getStart() + refAllele.length();
|
||||
if( !hasNullAltAllele )
|
||||
if (!hasNullAltAllele)
|
||||
endLoc--;
|
||||
|
||||
return endLoc;
|
||||
}
|
||||
|
||||
public static HashMap<PileupElement,LinkedHashMap<Allele,Double>> getIndelLikelihoodMap() {
|
||||
public static HashMap<PileupElement, LinkedHashMap<Allele, Double>> getIndelLikelihoodMap() {
|
||||
return indelLikelihoodMap.get();
|
||||
}
|
||||
|
||||
|
|
@ -434,8 +446,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
// so that per-sample DP will include deletions covering the event.
|
||||
protected int getFilteredDepth(ReadBackedPileup pileup) {
|
||||
int count = 0;
|
||||
for ( PileupElement p : pileup ) {
|
||||
if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) )
|
||||
for (PileupElement p : pileup) {
|
||||
if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase()))
|
||||
count++;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -30,10 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
|
@ -42,34 +39,38 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||
|
||||
private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50;
|
||||
|
||||
private boolean ALLOW_MULTIPLE_ALLELES;
|
||||
|
||||
private final boolean useAlleleFromVCF;
|
||||
|
||||
private final double[] likelihoodSums = new double[4];
|
||||
|
||||
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC;
|
||||
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
|
||||
// make sure the PL cache has been initialized with enough alleles
|
||||
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles
|
||||
UnifiedGenotyperEngine.calculatePLcache(3);
|
||||
}
|
||||
|
||||
public VariantContext getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup) {
|
||||
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final Map<String, AlignmentContext> contexts,
|
||||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenotypePriors priors,
|
||||
final List<Allele> alternateAllelesToUse,
|
||||
final boolean useBAQedPileup,
|
||||
final GenomeLocParser locParser) {
|
||||
|
||||
if ( !(priors instanceof DiploidSNPGenotypePriors) )
|
||||
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
|
||||
|
||||
final boolean[] basesToUse = new boolean[4];
|
||||
final byte refBase = ref.getBase();
|
||||
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
|
||||
|
|
@ -79,56 +80,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
alleles.add(Allele.create(refBase, true));
|
||||
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
|
||||
|
||||
// find the alternate allele(s) that we should be using
|
||||
if ( alternateAlleleToUse != null ) {
|
||||
basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true;
|
||||
} else if ( useAlleleFromVCF ) {
|
||||
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
|
||||
|
||||
// ignore places where we don't have a SNP
|
||||
if ( vc == null || !vc.isSNP() )
|
||||
return null;
|
||||
|
||||
for ( Allele allele : vc.getAlternateAlleles() )
|
||||
basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true;
|
||||
} else {
|
||||
|
||||
determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup);
|
||||
|
||||
// how many alternate alleles are we using?
|
||||
int alleleCounter = Utils.countSetBits(basesToUse);
|
||||
|
||||
// if there are no non-ref alleles...
|
||||
if ( alleleCounter == 0 ) {
|
||||
// if we only want variants, then we don't need to calculate genotype likelihoods
|
||||
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
|
||||
return builder.make();
|
||||
|
||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||
basesToUse[indexOfRefBase == 0 ? 1 : 0] = true;
|
||||
}
|
||||
}
|
||||
|
||||
// create the alternate alleles and the allele ordering (the ordering is crucial for the GLs)
|
||||
final int numAltAlleles = Utils.countSetBits(basesToUse);
|
||||
final int[] alleleOrdering = new int[numAltAlleles + 1];
|
||||
alleleOrdering[0] = indexOfRefBase;
|
||||
int alleleOrderingIndex = 1;
|
||||
int numLikelihoods = 1;
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
if ( i != indexOfRefBase && basesToUse[i] ) {
|
||||
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false));
|
||||
alleleOrdering[alleleOrderingIndex++] = i;
|
||||
numLikelihoods += alleleOrderingIndex;
|
||||
}
|
||||
}
|
||||
builder.alleles(alleles);
|
||||
|
||||
// create the genotypes; no-call everyone for now
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
final List<Allele> noCall = new ArrayList<Allele>();
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
// calculate the GLs
|
||||
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
|
||||
if ( useBAQedPileup )
|
||||
|
|
@ -137,10 +90,56 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
// create the GenotypeLikelihoods object
|
||||
final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
|
||||
final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE);
|
||||
if ( nGoodBases == 0 )
|
||||
continue;
|
||||
if ( nGoodBases > 0 )
|
||||
GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup)));
|
||||
}
|
||||
|
||||
final double[] allLikelihoods = GL.getLikelihoods();
|
||||
// find the alternate allele(s) that we should be using
|
||||
if ( alternateAllelesToUse != null ) {
|
||||
alleles.addAll(alternateAllelesToUse);
|
||||
} else if ( useAlleleFromVCF ) {
|
||||
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
|
||||
|
||||
// ignore places where we don't have a SNP
|
||||
if ( vc == null || !vc.isSNP() )
|
||||
return null;
|
||||
|
||||
alleles.addAll(vc.getAlternateAlleles());
|
||||
} else {
|
||||
|
||||
alleles.addAll(determineAlternateAlleles(refBase, GLs));
|
||||
|
||||
// if there are no non-ref alleles...
|
||||
if ( alleles.size() == 1 ) {
|
||||
// if we only want variants, then we don't need to calculate genotype likelihoods
|
||||
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
|
||||
return builder.make();
|
||||
|
||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
|
||||
}
|
||||
}
|
||||
|
||||
// create the alternate alleles and the allele ordering (the ordering is crucial for the GLs)
|
||||
final int numAlleles = alleles.size();
|
||||
final int numAltAlleles = numAlleles - 1;
|
||||
|
||||
final int[] alleleOrdering = new int[numAlleles];
|
||||
int alleleOrderingIndex = 0;
|
||||
int numLikelihoods = 0;
|
||||
for ( Allele allele : alleles ) {
|
||||
alleleOrdering[alleleOrderingIndex++] = BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0]);
|
||||
numLikelihoods += alleleOrderingIndex;
|
||||
}
|
||||
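A side note on the ordering loop just above: because each allele contributes alleleOrderingIndex to the running total, numLikelihoods ends up at numAlleles*(numAlleles+1)/2, the number of distinct diploid genotypes, which is why the allele ordering is crucial for laying out the GLs. A quick, illustrative check:

class GenotypeCountCheck {
    public static void main(String[] args) {
        for (int numAlleles = 1; numAlleles <= 4; numAlleles++) {
            int numLikelihoods = 0;
            for (int alleleOrderingIndex = 1; alleleOrderingIndex <= numAlleles; alleleOrderingIndex++)
                numLikelihoods += alleleOrderingIndex;   // mirrors numLikelihoods += alleleOrderingIndex above
            System.out.println(numAlleles + " alleles -> " + numLikelihoods
                    + " diploid genotypes (= " + numAlleles * (numAlleles + 1) / 2 + ")");
        }
    }
}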
builder.alleles(alleles);
|
||||
|
||||
// create the genotypes; no-call everyone for now
|
||||
final GenotypesContext genotypes = GenotypesContext.create();
|
||||
final List<Allele> noCall = new ArrayList<Allele>();
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
for ( SampleGenotypeData sampleData : GLs ) {
|
||||
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
|
||||
final double[] myLikelihoods = new double[numLikelihoods];
|
||||
|
||||
int myLikelihoodsIndex = 0;
|
||||
|
|
@ -151,62 +150,46 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
}
|
||||
|
||||
// normalize in log space so that max element is zero.
|
||||
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
|
||||
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
|
||||
final HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth);
|
||||
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
|
||||
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
|
||||
genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
|
||||
}
|
||||
|
||||
return builder.genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
// determines the alleles to use
|
||||
protected List<Allele> determineAlternateAlleles(final byte ref, final List<SampleGenotypeData> sampleDataList) {
|
||||
|
||||
// fills in the allelesToUse array
|
||||
protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map<String, AlignmentContext> contexts, boolean useBAQedPileup) {
|
||||
int[] qualCounts = new int[4];
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
// calculate the sum of quality scores for each base
|
||||
ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup();
|
||||
for ( PileupElement p : pileup ) {
|
||||
// ignore deletions
|
||||
if ( p.isDeletion() || (!p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE) )
|
||||
continue;
|
||||
|
||||
final int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
|
||||
if ( index >= 0 ) {
|
||||
qualCounts[index] += p.getQual();
|
||||
}
|
||||
final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref);
final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal();
for ( int i = 0; i < 4; i++ )
likelihoodSums[i] = 0.0;

// based on the GLs, find the alternate alleles with enough probability
for ( SampleGenotypeData sampleData : sampleDataList ) {
final double[] likelihoods = sampleData.GL.getLikelihoods();
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
if ( PLindexOfBestGL != PLindexOfRef ) {
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
if ( alleles[0] != baseIndexOfRef )
likelihoodSums[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
// don't double-count it
if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
likelihoodSums[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
}
}

if ( ALLOW_MULTIPLE_ALLELES ) {
for ( byte altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) {
allelesToUse[index] = true;
}
}
} else {
// set the non-ref base which has the maximum quality score sum
int maxCount = 0;
int indexOfMax = 0;
for ( byte altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] > maxCount ) {
maxCount = qualCounts[index];
indexOfMax = index;
}
}

if ( maxCount > 0 )
allelesToUse[indexOfMax] = true;
final List<Allele> allelesToUse = new ArrayList<Allele>(3);
for ( int i = 0; i < 4; i++ ) {
if ( likelihoodSums[i] > 0.0 )
allelesToUse.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false));
}

return allelesToUse;
}
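A reduced sketch of the likelihood-sum rule that the new determineAlternateAlleles applies above: per sample, take the best genotype; if it is not hom-ref, credit its non-reference allele(s) with the log10 margin over hom-ref, then keep every allele whose summed margin is positive. The 4x4 matrix indexing below replaces the PLIndexToAlleleIndex tables purely for illustration; indices 0..3 stand for A,C,G,T.

class AltAlleleSelectionSketch {
    // gl[i][j] (with i <= j) is one sample's log10 likelihood of the diploid genotype {i,j}.
    static double[] sumSupportOverSamples(double[][][] samplesGLs, int refIndex) {
        final double[] likelihoodSums = new double[4];
        for (double[][] gl : samplesGLs) {
            int bi = refIndex, bj = refIndex;                     // start from hom-ref
            for (int i = 0; i < 4; i++)
                for (int j = i; j < 4; j++)
                    if (gl[i][j] > gl[bi][bj]) { bi = i; bj = j; }
            double margin = gl[bi][bj] - gl[refIndex][refIndex];  // best GL minus hom-ref GL
            if (bi != refIndex) likelihoodSums[bi] += margin;
            if (bj != refIndex && bj != bi) likelihoodSums[bj] += margin;   // don't double-count
        }
        return likelihoodSums;   // alleles with a positive sum would be emitted as alternates
    }

    public static void main(String[] args) {
        double[][] gl = new double[4][4];
        for (double[] row : gl) java.util.Arrays.fill(row, -10.0);
        gl[0][0] = -3.0;   // hom-ref A/A
        gl[0][2] = -0.5;   // het A/G is this sample's best genotype
        System.out.println(sumSupportOverSamples(new double[][][]{ gl }, 0)[2]);   // 2.5
    }
}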
|
||||
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
|
||||
|
|
@ -220,11 +203,23 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
|
||||
public class BAQedPileupElement extends PileupElement {
|
||||
public BAQedPileupElement( final PileupElement PE ) {
|
||||
super(PE.getRead(), PE.getOffset());
|
||||
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip());
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
|
||||
}
|
||||
|
||||
}
|
||||
private static class SampleGenotypeData {
|
||||
|
||||
public final String name;
|
||||
public final DiploidSNPGenotypeLikelihoods GL;
|
||||
public final int depth;
|
||||
|
||||
public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) {
|
||||
this.name = name;
|
||||
this.GL = GL;
|
||||
this.depth = depth;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -204,6 +204,6 @@ public class UGBoundAF extends RodWalker<VariantContext,Integer> {
return Math.log10(s_2 + (s_2 - s)/15.0);
}

return ExactAFCalculationModel.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1));
return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1));
}
}
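For orientation on the UGBoundAF change above: simpAux has the shape of an adaptive Simpson's-rule recursion, with (s_2 - s)/15 as the usual Richardson correction term; the walker runs it on log10 likelihoods, which is why the two half-interval results are combined with approximateLog10SumLog10 rather than added. A plain linear-space sketch of the same recursion, with an illustrative integrand:

class AdaptiveSimpsonSketch {
    static double f(double x) { return x * x; }   // stand-in integrand; the walker integrates an AF likelihood

    static double simpson(double a, double b) {
        double c = (a + b) / 2.0;
        return (b - a) / 6.0 * (f(a) + 4.0 * f(c) + f(b));
    }

    // whole = Simpson estimate over [a,b]; recurse until the halved estimate agrees within eps
    static double integrate(double a, double b, double eps, double whole, int cap) {
        double c = (a + b) / 2.0;
        double left = simpson(a, c), right = simpson(c, b);
        double s2 = left + right;
        if (cap <= 0 || Math.abs(s2 - whole) <= 15.0 * eps)
            return s2 + (s2 - whole) / 15.0;       // same correction as log10(s_2 + (s_2 - s)/15.0) above
        return integrate(a, c, eps / 2.0, left, cap - 1) + integrate(c, b, eps / 2.0, right, cap - 1);
    }

    public static void main(String[] args) {
        // integral of x^2 over [0,1] is 1/3
        System.out.println(integrate(0.0, 1.0, 1e-9, simpson(0.0, 1.0), 30));
    }
}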
|
|
|
|||
|
|
@ -1,114 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
* Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs).
|
||||
* Absolutely not supported or recommended for public use.
|
||||
* Run this as you would the UnifiedGenotyper, except that you must additionally pass in a VCF bound to
|
||||
* the name 'allele' so we know which alternate allele to use at each site.
|
||||
*/
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
@By(DataSource.READS)
|
||||
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
|
||||
public class UGCalcLikelihoods extends LocusWalker<VariantCallContext, Integer> implements TreeReducible<Integer> {
|
||||
|
||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// enable deletions in the pileup
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable extended events for indels
|
||||
public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; }
|
||||
|
||||
public void initialize() {
|
||||
// get all of the unique sample names
|
||||
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
|
||||
|
||||
// initialize the header
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
|
||||
writer.writeHeader(new VCFHeader(headerInfo, samples)) ;
|
||||
}
|
||||
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext);
|
||||
return call == null ? null : new VariantCallContext(call, true);
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer treeReduce(Integer lhs, Integer rhs) {
|
||||
return lhs + rhs;
|
||||
}
|
||||
|
||||
public Integer reduce(VariantCallContext value, Integer sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
try {
|
||||
writer.add(value);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
||||
return sum + 1;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
logger.info(String.format("Visited bases: %d", sum));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,152 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs).
|
||||
* Absolutely not supported or recommended for public use.
|
||||
* Run this as you would the UnifiedGenotyper, except that instead of '-I reads' it expects any number
|
||||
* of GL/PL-annotated VCFs bound to a name starting with 'variant'.
|
||||
*/
|
||||
public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
|
||||
|
||||
@ArgumentCollection
|
||||
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
|
||||
public List<RodBinding<VariantContext>> variants;
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// variant track names
|
||||
private Set<String> trackNames = new HashSet<String>();
|
||||
|
||||
public void initialize() {
|
||||
|
||||
for ( RodBinding<VariantContext> rb : variants )
|
||||
trackNames.add(rb.getName());
|
||||
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames);
|
||||
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING )
|
||||
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
|
||||
|
||||
// initialize the header
|
||||
writer.writeHeader(new VCFHeader(headerInfo, samples));
|
||||
}
|
||||
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
List<VariantContext> VCs = tracker.getValues(variants, context.getLocation());
|
||||
|
||||
VariantContext mergedVC = mergeVCsWithGLs(VCs);
|
||||
if ( mergedVC == null )
|
||||
return null;
|
||||
|
||||
return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC);
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(VariantCallContext value, Integer sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
try {
|
||||
VariantContextBuilder builder = new VariantContextBuilder(value);
|
||||
VariantContextUtils.calculateChromosomeCounts(builder, true);
|
||||
writer.add(builder.make());
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
||||
return sum + 1;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
logger.info(String.format("Visited sites: %d", result));
|
||||
}
|
||||
|
||||
private static VariantContext mergeVCsWithGLs(List<VariantContext> VCs) {
|
||||
// we can't use the VCUtils classes because our VCs can all be no-calls
|
||||
if ( VCs.size() == 0 )
|
||||
return null;
|
||||
|
||||
VariantContext variantVC = null;
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if ( variantVC == null && vc.isVariant() )
|
||||
variantVC = vc;
|
||||
genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes()));
|
||||
}
|
||||
|
||||
if ( variantVC == null ) {
|
||||
VariantContext vc = VCs.get(0);
|
||||
throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart());
|
||||
}
|
||||
|
||||
return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) {
|
||||
GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size());
|
||||
for ( final Genotype g : genotypes ) {
|
||||
if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null )
|
||||
genotypesWithGLs.add(g);
|
||||
}
|
||||
return genotypesWithGLs;
|
||||
}
|
||||
}
|
||||
|
|
@ -84,8 +84,8 @@ public class UnifiedArgumentCollection {
|
|||
/**
|
||||
* This argument is not enabled by default because it increases the runtime by an appreciable amount.
|
||||
*/
|
||||
@Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false)
|
||||
public boolean COMPUTE_SLOD = false;
|
||||
@Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false)
|
||||
public boolean NO_SLOD = false;
|
||||
|
||||
/**
|
||||
* When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
|
||||
|
|
@ -103,21 +103,12 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
/**
|
||||
* The default behavior of the Unified Genotyper is to allow the genotyping of just one alternate allele in discovery mode; using this flag
|
||||
* will enable the discovery of multiple alternate alleles. Please note that this works for SNPs only and that it is still highly experimental.
|
||||
* For advanced users only.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false)
|
||||
public boolean MULTI_ALLELIC = false;
|
||||
|
||||
/**
|
||||
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
|
||||
* then this site will be skipped and a warning printed. Note that genotyping sites with many alternate alleles is both CPU and memory intensive.
|
||||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive.
|
||||
*/
|
||||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 5;
|
||||
public int MAX_ALTERNATE_ALLELES = 3;
|
||||
|
||||
// indel-related arguments
|
||||
/**
|
||||
|
|
@ -146,8 +137,8 @@ public class UnifiedArgumentCollection {
|
|||
public int INDEL_HAPLOTYPE_SIZE = 80;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "bandedIndel", shortName = "bandedIndel", doc = "Banded Indel likelihood computation", required = false)
|
||||
public boolean BANDED_INDEL_COMPUTATION = false;
|
||||
@Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false)
|
||||
public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
|
||||
|
|
@ -168,7 +159,7 @@ public class UnifiedArgumentCollection {
|
|||
uac.PCR_error = PCR_error;
|
||||
uac.GenotypingMode = GenotypingMode;
|
||||
uac.OutputMode = OutputMode;
|
||||
uac.COMPUTE_SLOD = COMPUTE_SLOD;
|
||||
uac.NO_SLOD = NO_SLOD;
|
||||
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
|
||||
|
|
@ -184,8 +175,7 @@ public class UnifiedArgumentCollection {
|
|||
|
||||
// todo- arguments to remove
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION;
|
||||
uac.MULTI_ALLELIC = MULTI_ALLELIC;
|
||||
uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION;
|
||||
return uac;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -126,10 +126,10 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
|
||||
public RodBinding<VariantContext> getVariantRodBinding() { return null; }
|
||||
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
|
||||
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
||||
public boolean alwaysAppendDbsnpId() { return false; }
|
||||
|
||||
/**
|
||||
* A raw, unfiltered, highly specific callset in VCF format.
|
||||
|
|
@ -169,9 +169,11 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// enable deletions in the pileup
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable extended events for indels
|
||||
@Override
|
||||
public boolean generateExtendedEvents() {
|
||||
return (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES);
|
||||
}
|
||||
|
|
@ -205,6 +207,12 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
*
|
||||
**/
|
||||
public void initialize() {
|
||||
// warn the user for misusing EMIT_ALL_SITES
|
||||
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES &&
|
||||
UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY &&
|
||||
UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP )
|
||||
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
|
||||
|
||||
// get all of the unique sample names
|
||||
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
|
|
@ -232,7 +240,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
||||
|
||||
// annotation (INFO) fields from UnifiedGenotyper
|
||||
if ( UAC.COMPUTE_SLOD )
|
||||
if ( !UAC.NO_SLOD )
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
|
||||
|
||||
|
|
|
|||
|
|
@ -54,8 +54,9 @@ public class UnifiedGenotyperEngine {
|
|||
EMIT_VARIANTS_ONLY,
|
||||
/** produces calls at variant sites and confident reference sites */
|
||||
EMIT_ALL_CONFIDENT_SITES,
|
||||
/** produces calls at any callable site regardless of confidence; this argument is intended for point
|
||||
* mutations (SNPs) only and while some indel calls may be produced they are by no means comprehensive */
|
||||
/** produces calls at any callable site regardless of confidence; this argument is intended only for point
|
||||
* mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by
|
||||
* no means produce a comprehensive set of indels in DISCOVERY mode */
|
||||
EMIT_ALL_SITES
|
||||
}
|
||||
|
||||
|
|
@ -236,14 +237,14 @@ public class UnifiedGenotyperEngine {
|
|||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
// private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine
|
||||
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, AlignmentContext> stratifiedContexts, AlignmentContextUtils.ReadOrientation type, Allele alternateAlleleToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) {
|
||||
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, AlignmentContext> stratifiedContexts, AlignmentContextUtils.ReadOrientation type, List<Allele> alternateAllelesToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) {
|
||||
|
||||
// initialize the data for this thread if that hasn't been done yet
|
||||
if ( glcm.get() == null ) {
|
||||
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
|
||||
}
|
||||
|
||||
return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine);
|
||||
return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser);
|
||||
}
|
||||
|
||||
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
|
||||
|
|
@ -252,7 +253,7 @@ public class UnifiedGenotyperEngine {
|
|||
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if ( vcInput == null )
|
||||
return null;
|
||||
vc = new VariantContextBuilder(vcInput).source("UG_call").noID().referenceBaseForIndel(ref.getBase()).attributes(new HashMap<String, Object>()).filters(new HashSet<String>()).make();
|
||||
vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), vcInput.getAlleles()).make();
|
||||
} else {
|
||||
// deal with bad/non-standard reference bases
|
||||
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
|
||||
|
|
@ -294,12 +295,6 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) {
|
||||
logger.warn("the Unified Genotyper is currently set to genotype at most " + UAC.MAX_ALTERNATE_ALLELES + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + vc.getAlternateAlleles().size() + " alternate alleles; see the --max_alternate_alleles argument");
|
||||
return null;
|
||||
}
|
||||
|
||||
// estimate our confidence in a reference call and return
|
||||
if ( vc.getNSamples() == 0 ) {
|
||||
if ( limitedContext )
|
||||
|
|
@ -312,25 +307,32 @@ public class UnifiedGenotyperEngine {
|
|||
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
|
||||
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
|
||||
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
|
||||
|
||||
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = true;

// determine which alternate alleles have AF>0
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
myAlleles.add(vc.getReference());
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[i]);
final Allele alternateAllele = vc.getAlternateAllele(i);
final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele);
// the genotyping model may have stripped it out
if ( indexOfAllele == -1 )
continue;

int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]);

// if the most likely AC is not 0, then this is a good alternate allele to use;
// make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array
if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[i][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) {
altAllelesToUse[i] = true;
if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) {
myAlleles.add(alternateAllele);
bestGuessIsRef = false;
}
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
altAllelesToUse[i] = true;
myAlleles.add(alternateAllele);
}
}
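Reduced to its core, the per-allele decision in the loop above is: keep the alternate allele when some allele count greater than zero has a higher posterior than AC=0, and keep it unconditionally in GENOTYPE_GIVEN_ALLELES mode. A sketch with plain arrays (the layout here is assumed for illustration, not the engine's AFresult structure):

class AlleleKeepSketch {
    // acPosteriors[k] holds the log10 posterior for allele count k+1 of one alternate allele;
    // the AC=0 posterior is tracked separately, mirroring log10PosteriorOfAFzero above.
    static boolean keepAllele(double[] acPosteriors, double log10PosteriorOfAFzero, boolean genotypeGivenAlleles) {
        double bestNonZero = Double.NEGATIVE_INFINITY;
        for (double p : acPosteriors) bestNonZero = Math.max(bestNonZero, p);
        return bestNonZero > log10PosteriorOfAFzero || genotypeGivenAlleles;
    }

    public static void main(String[] args) {
        System.out.println(keepAllele(new double[]{ -2.0, -3.5, -9.0 }, -4.0, false));  // true: some AC>0 beats AC=0
        System.out.println(keepAllele(new double[]{ -6.0, -7.0, -9.0 }, -0.5, false));  // false: reference favored
        System.out.println(keepAllele(new double[]{ -6.0, -7.0, -9.0 }, -0.5, true));   // true: kept in GGA mode
    }
}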
|
|
@ -366,20 +368,6 @@ public class UnifiedGenotyperEngine {
|
|||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
|
||||
}
|
||||
|
||||
// strip out any alleles that aren't going to be used in the VariantContext
|
||||
final List<Allele> myAlleles;
|
||||
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
|
||||
myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
|
||||
myAlleles.add(vc.getReference());
|
||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||
if ( altAllelesToUse[i] )
|
||||
myAlleles.add(vc.getAlternateAllele(i));
|
||||
}
|
||||
} else {
|
||||
// use all of the alleles if we are given them by the user
|
||||
myAlleles = vc.getAlleles();
|
||||
}
|
||||
|
||||
// start constructing the resulting VC
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
|
||||
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
|
||||
|
|
@ -393,7 +381,7 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// create the genotypes
|
||||
final GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse);
|
||||
final GenotypesContext genotypes = subsetAlleles(vc, myAlleles, true);
|
||||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null && !limitedContext )
|
||||
|
|
@@ -406,33 +394,31 @@ public class UnifiedGenotyperEngine {
if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) {
if ( !UAC.NO_SLOD && !limitedContext && !bestGuessIsRef ) {
//final boolean DEBUG_SLOD = false;
// the overall lod
VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model);
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero;
double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model);
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult);
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero;
double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
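The hunk above evaluates the posterior of a non-zero allele frequency three times: once over all reads (COMPLETE) and once each over forward-only and reverse-only read orientations. A strand-specific LOD is typically formed by asking how much better the data are explained when the variant evidence is confined to one strand; the exact combination happens outside this hunk, so the lines below are only an illustrative sketch of that construction, not code taken from this commit:

    // Illustrative sketch only; the combination actually used downstream is not shown in this hunk.
    double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
    double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
    double strandScore = Math.max(forwardLod, reverseLod);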
@@ -771,52 +757,69 @@ public class UnifiedGenotyperEngine {
/**
* @param vc variant context with genotype likelihoods
* @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use
* @return genotypes
*/
public static GenotypesContext assignGenotypes(final VariantContext vc,
final boolean[] allelesToUse) {
public static GenotypesContext assignGenotypes(final VariantContext vc) {
return subsetAlleles(vc, vc.getAlleles(), true);
}
// the no-called genotypes
final GenotypesContext GLs = vc.getGenotypes();
/**
* @param vc variant context with genotype likelihoods
* @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC ***
* @param assignGenotypes true if we should change the genotypes based on the (subsetted) PLs
* @return genotypes
*/
public static GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes) {
// the genotypes with PLs
final GenotypesContext oldGTs = vc.getGenotypes();
// samples
final List<String> sampleIndices = GLs.getSampleNamesOrderedByName();
final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();
// the new called genotypes to create
final GenotypesContext calls = GenotypesContext.create();
// the new genotypes to create
final GenotypesContext newGTs = GenotypesContext.create();
// we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
final int numOriginalAltAlleles = allelesToUse.length;
final List<Allele> newAlleles = new ArrayList<Allele>(numOriginalAltAlleles+1);
newAlleles.add(vc.getReference());
for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
if ( allelesToUse[i] )
newAlleles.add(vc.getAlternateAllele(i));
}
final int numNewAltAlleles = newAlleles.size() - 1;
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final int numNewAltAlleles = allelesToUse.size() - 1;
// which PLs should be carried forward?
ArrayList<Integer> likelihoodIndexesToUse = null;
// an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles,
// then we can keep the PLs as is; otherwise, we determine which ones to keep
if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
likelihoodIndexesToUse = new ArrayList<Integer>(30);
// make sure that we've cached enough data
if ( numOriginalAltAlleles > PLIndexToAlleleIndex.length - 1 )
calculatePLcache(numOriginalAltAlleles);
final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles];
final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles];
for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
if ( allelesToUse.contains(vc.getAlternateAllele(i)) )
altAlleleIndexToUse[i] = true;
}
for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) {
int[] alleles = PLcache[PLindex];
final int[] alleles = PLcache[PLindex];
// consider this entry only if both of the alleles are good
if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) )
if ( (alleles[0] == 0 || altAlleleIndexToUse[alleles[0] - 1]) && (alleles[1] == 0 || altAlleleIndexToUse[alleles[1] - 1]) )
likelihoodIndexesToUse.add(PLindex);
}
}
// create the new genotypes
for ( int k = GLs.size() - 1; k >= 0; k-- ) {
final String sample = sampleIndices.get(k);
final Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
for ( int k = 0; k < oldGTs.size(); k++ ) {
final Genotype g = oldGTs.get(sampleIndices.get(k));
if ( !g.hasLikelihoods() ) {
newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
continue;
}
// create the new likelihoods array from the alleles we are allowed to use
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
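The "which PLs should be carried forward?" logic above relies on the standard VCF ordering of diploid genotype likelihoods, where the entry for the unordered allele pair (j, k) with j <= k sits at index k*(k+1)/2 + j. A standalone sketch of that mapping and of the subsetting test follows; the class and method names are hypothetical, and only the idea mirrors the PLcache/altAlleleIndexToUse code in the hunk:

    // Illustrative sketch, not GATK API: enumerate diploid PL indexes and keep the ones
    // whose two alleles survive the subsetting.
    import java.util.ArrayList;
    import java.util.List;

    public class PLSubsetSketch {
        // VCF orders diploid PLs as (0,0), (0,1), (1,1), (0,2), (1,2), (2,2), ...
        static int[][] plIndexToAllelePair(final int numAltAlleles) {
            final int n = (numAltAlleles + 1) * (numAltAlleles + 2) / 2;
            final int[][] pairs = new int[n][2];
            int idx = 0;
            for (int k = 0; k <= numAltAlleles; k++)
                for (int j = 0; j <= k; j++)
                    pairs[idx++] = new int[]{j, k};
            return pairs;
        }

        // Keep only the PL entries whose two alleles are still in use (allele 0 is the reference).
        static List<Integer> likelihoodIndexesToKeep(final boolean[] keepAltAllele) {
            final List<Integer> keep = new ArrayList<Integer>();
            final int[][] pairs = plIndexToAllelePair(keepAltAllele.length);
            for (int plIndex = 0; plIndex < pairs.length; plIndex++) {
                final int a = pairs[plIndex][0], b = pairs[plIndex][1];
                final boolean aOk = (a == 0) || keepAltAllele[a - 1];
                final boolean bOk = (b == 0) || keepAltAllele[b - 1];
                if (aOk && bOk)
                    keep.add(plIndex);
            }
            return keep;
        }

        public static void main(String[] args) {
            // Two alt alleles, dropping the second: keeps indexes 0, 1, 2 (ref/ref, ref/alt1, alt1/alt1).
            System.out.println(likelihoodIndexesToKeep(new boolean[]{true, false}));
        }
    }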
@@ -833,29 +836,38 @@ public class UnifiedGenotyperEngine {
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
}
// if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample
// if there is no mass on the (new) likelihoods, then just no-call the sample
if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) {
calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
continue;
newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
}
else {
Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
if ( numNewAltAlleles == 0 )
attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
else
attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods));
// find the genotype with maximum likelihoods
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex];
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
myAlleles.add(newAlleles.get(alleles[0]));
myAlleles.add(newAlleles.get(alleles[1]));
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
if ( numNewAltAlleles == 0 )
attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
else
attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods));
calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false));
// if we weren't asked to assign a genotype, then just no-call the sample
if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL )
newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, attrs, false));
else
newGTs.add(assignGenotype(g, newLikelihoods, allelesToUse, numNewAltAlleles, attrs));
}
}
return newGTs;
}
protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List<Allele> allelesToUse, final int numNewAltAlleles, final Map<String, Object> attrs) {
// find the genotype with maximum likelihoods
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex];
return calls;
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
myAlleles.add(allelesToUse.get(alleles[0]));
myAlleles.add(allelesToUse.get(alleles[1]));
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);
}
}
@@ -454,8 +454,7 @@ public class HaplotypeIndelErrorModel {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
// Second term is a constant added to both likelihoods so will be ignored
haplotypeLikehoodMatrix[i][j] += MathUtils.softMax(readLikelihood[0],
readLikelihood[1]);
haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(readLikelihood[0], readLikelihood[1]);
}
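The comment in the hunk above describes the log10-space sum that approximateLog10SumLog10 now performs in place of softMax: the Jacobian logarithm, log10(10^a + 10^b) = max(a,b) + log10(1 + 10^-|a-b|), where the correction term is what the table lookup approximates. A minimal exact version, for illustration only (this is not the GATK implementation):

    // Exact log10-space sum of two log10 likelihoods; the GATK version approximates
    // the log10(1 + 10^-diff) correction with a precomputed table.
    static double log10SumLog10(final double a, final double b) {
        final double max = Math.max(a, b);
        final double diff = Math.abs(a - b);
        return max + Math.log10(1.0 + Math.pow(10.0, -diff));
    }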
@@ -166,18 +166,17 @@ public class PairHMMIndelErrorModel {
final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
matchMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead,
YMetricArray[im1][jm1] + pBaseRead);
matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]});
final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
XMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1);
XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1);
// update Y array
final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
YMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2);
YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2);
}
}

@@ -316,9 +315,7 @@ public class PairHMMIndelErrorModel {
final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1;
final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ],
XMetricArray[bestI][bestJ],
YMetricArray[bestI][bestJ]);
final double bestMetric = MathUtils.approximateLog10SumLog10(new double[]{ matchMetricArray[bestI][bestJ], XMetricArray[bestI][bestJ], YMetricArray[bestI][bestJ] });
/*
if (DEBUG) {

@@ -651,7 +648,7 @@ public class PairHMMIndelErrorModel {
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix
for (int i=0; i < numHaplotypes; i++) {
for (int j=i; j < numHaplotypes; j++){
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]

@@ -665,7 +662,7 @@ public class PairHMMIndelErrorModel {
final double li = readLikelihoods[readIdx][i];
final double lj = readLikelihoods[readIdx][j];
final int readCount = readCounts[readIdx];
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF);
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + LOG_ONE_HALF);
}
}
}

@@ -678,7 +675,7 @@ public class PairHMMIndelErrorModel {
}
}
// renormalize so that max element is zero.
// renormalize so that max element is zero.
return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
}
}
@@ -26,6 +26,10 @@
package org.broadinstitute.sting.gatk.walkers.indels;

import net.sf.samtools.*;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlContext;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.jexl2.MapContext;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;

@@ -71,7 +75,7 @@ import java.util.*;
* <p>
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
* include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
* include additional statistics such as mismatches and base qualitites around the calls, read strandness (how many
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
@@ -88,6 +92,16 @@ import java.util.*;
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
*
* Which (putative) calls will make it into the output file(s) is controlled by an expression/list of expressions passed with -filter
* flag: if any of the expressions evaluate to TRUE, the site will be discarded. Otherwise the putative call and all the
* associated statistics will be printed into the output. Expressions recognize the following variables(in paired-sample
* somatic mode variables are prefixed with T_ and N_ for Tumor and Normal, e.g. N_COV and T_COV are defined instead of COV):
* COV for coverage at the site, INDEL_F for fraction of reads supporting consensus indel at the site (wrt total coverage),
* INDEL_CF for fraction of reads with consensus indel wrt all reads with an indel at the site, CONS_CNT for the count of
* reads supporting the consensus indel at the site. Conventional arithmetic and logical operations are supported. For instance,
* N_COV<4||T_COV<6||T_INDEL_F<0.3||T_INDEL_CF<0.7 instructs the tool to only output indel calls with at least 30% observed
* allelic fraction and with consensus indel making at least 70% of all indel observations at the site, and only at the sites
* where tumor coverage and normal coverage are at least 6 and 4, respectively.
* <h2>Input</h2>
* <p>
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
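The -filter expressions described above are plain JEXL: each one is compiled once and evaluated against a context that holds COV, CONS_CNT, INDEL_F and INDEL_CF (or their T_/N_ prefixed variants), as the later hunks of this commit show. A standalone sketch of that evaluation, assuming Apache Commons JEXL 2 on the classpath (the class name and the numbers are made up for illustration):

    import org.apache.commons.jexl2.Expression;
    import org.apache.commons.jexl2.JexlContext;
    import org.apache.commons.jexl2.JexlEngine;
    import org.apache.commons.jexl2.MapContext;

    public class FilterExpressionSketch {
        public static void main(String[] args) {
            final JexlEngine jexl = new JexlEngine();
            final Expression filter = jexl.createExpression("T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7");

            // A hypothetical somatic indel site: tumor coverage 40 with 15 consensus indel reads, normal coverage 30.
            final JexlContext site = new MapContext();
            site.set("T_COV", 40);
            site.set("N_COV", 30);
            site.set("T_INDEL_F", 15.0 / 40.0);  // consensus indel reads as a fraction of all tumor reads
            site.set("T_INDEL_CF", 1.0);         // every indel-bearing read carries the consensus indel

            final boolean discard = (Boolean) filter.evaluate(site);
            System.out.println(discard ? "site discarded" : "site written to output");
        }
    }

If any expression in the list evaluates to TRUE the site is discarded, which is exactly the convention documented above.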
@@ -147,30 +161,44 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
java.io.File bedOutput = null;
@Deprecated
@Argument(fullName="minCoverage", shortName="minCoverage",
doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
"with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
"with --unpaired (single sample) option, this value is used for minimum sample coverage. "+
"INSTEAD USE: T_COV<cutoff (or COV<cutoff in unpaired mode) in -filter expression (see -filter).",
required=false)
int minCoverage = 6;
@Deprecated
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
"or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
"or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored. "+
"INSTEAD USE: N_COV<cutoff in -filter expression (see -filter).", required=false)
int minNormalCoverage = 4;
@Deprecated
@Argument(fullName="minFraction", shortName="minFraction",
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, "+
"required for making a call"+
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction). "+
"INSTEAD USE: T_INDEL_F<cutoff (or INDEL_F<cutoff in unpaired mode) in -filter expression "+
"(see -filter).", required=false)
double minFraction = 0.3;
@Deprecated
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
"all indel observations at the site exceeds this threshold", required=false)
"all indel observations at the site exceeds this threshold. "+
"INSTEAD USE: T_INDEL_CF<cutoff (or INDEL_CF<cutoff in unpaired mode) in -filter expression "+
"(see -filter).", required=false)
double minConsensusFraction = 0.7;
@Deprecated
@Argument(fullName="minIndelCount", shortName="minCnt",
doc="Minimum count of reads supporting consensus indel required for making the call. "+
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
"(minIndelCount not met) will not pass.", required=false)
"(minIndelCount not met) will not pass. INSTEAD USE: T_CONS_CNT<cutoff "+
"(or CONS_CNT<cutoff in unpaired mode) in -filter expression (see -filter).", required=false)
int minIndelCount = 0;
@Argument(fullName="refseq", shortName="refseq",

@@ -178,6 +206,13 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
"GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
String RefseqFileName = null;
@Argument(shortName="filter", doc="One or more logical expressions. If any of the expressions is TRUE, " +
"putative indel will be discarded and nothing will be printed into the output (unless genotyping "+
"at the specific position is explicitly requested, see -genotype). "+
"Default: T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7", required=false)
public ArrayList<String> FILTER_EXPRESSIONS = new ArrayList<String>();
//@Argument(fullName="blacklistedLanes", shortName="BL",
// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
@@ -221,7 +256,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
private Writer verboseWriter = null;
private static String annGenomic = "GENOMIC";
private static String annGenomic = "GENOMIC\t";
private static String annIntron = "INTRON";
private static String annUTR = "UTR";
private static String annCoding = "CODING";

@@ -245,6 +280,32 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed;
// can be 1 base before lastGenotyped start
private JexlEngine jexlEngine = new JexlEngine();
private ArrayList<Expression> jexlExpressions = new ArrayList<Expression>();
// the following arrays store indel source-specific (normal/tumor) metric names
// for fast access when populating JEXL expression contexts (see IndelPrecall.fillContext())
private final static String[] normalMetricsCassette = new String[4];
private final static String[] tumorMetricsCassette = new String[4];
private final static String[] singleMetricsCassette = new String[4];
private final static int C_COV=0;
private final static int C_CONS_CNT=1;
private final static int C_INDEL_F=2;
private final static int C_INDEL_CF=3;
static {
normalMetricsCassette[C_COV] = "N_COV";
tumorMetricsCassette[C_COV] = "T_COV";
singleMetricsCassette[C_COV] = "COV";
normalMetricsCassette[C_CONS_CNT] = "N_CONS_CNT";
tumorMetricsCassette[C_CONS_CNT] = "T_CONS_CNT";
singleMetricsCassette[C_CONS_CNT] = "CONS_CNT";
normalMetricsCassette[C_INDEL_F] = "N_INDEL_F";
tumorMetricsCassette[C_INDEL_F] = "T_INDEL_F";
singleMetricsCassette[C_INDEL_F] = "INDEL_F";
normalMetricsCassette[C_INDEL_CF] = "N_INDEL_CF";
tumorMetricsCassette[C_INDEL_CF] = "T_INDEL_CF";
singleMetricsCassette[C_INDEL_CF] = "INDEL_CF";
}
// "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt"
@@ -389,6 +450,24 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ;
refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile);
// Now initialize JEXL expressions:
if ( FILTER_EXPRESSIONS.size() == 0 ) {
if ( call_unpaired ) {
FILTER_EXPRESSIONS.add("COV<6||INDEL_F<0.3||INDEL_CF<0.7");
} else {
FILTER_EXPRESSIONS.add("T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7");
}
}
for ( String s : FILTER_EXPRESSIONS ) {
try {
Expression e = jexlEngine.createExpression(s);
jexlExpressions.add(e);
} catch (Exception e) {
throw new UserException.BadArgumentValue("Filter expression", "Invalid expression used (" + s + "). Please see the JEXL docs for correct syntax.") ;
}
}
}
@@ -661,14 +740,26 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue;
IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH);
JexlContext jc = new MapContext();
normalCall.fillContext(jc,singleMetricsCassette);
boolean discard_event = false;
if ( normalCall.getCoverage() < minCoverage && ! genotype ) {
if ( DEBUG ) {
System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
}
continue; // low coverage
for ( Expression e : jexlExpressions ) {
if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; }
}
if ( discard_event && ! genotype ) {
normal_context.indelsAt(pos).clear();
continue; //*
}
// if ( normalCall.getCoverage() < minCoverage && ! genotype ) {
// if ( DEBUG ) {
// System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
// }
// continue; // low coverage
// }
if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos);
long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() );

@@ -697,24 +788,16 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos);
boolean haveCall = normalCall.isCall(); // cache the value
if ( haveCall || genotype) {
if ( haveCall ) normalCallsMade++;
printVCFLine(vcf_writer,normalCall);
if ( bedWriter != null ) normalCall.printBedLine(bedWriter);
if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall);
lastGenotypedPosition = pos;
}
if ( ! discard_event ) normalCallsMade++;
printVCFLine(vcf_writer,normalCall, discard_event);
if ( bedWriter != null ) normalCall.printBedLine(bedWriter);
if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, discard_event);
lastGenotypedPosition = pos;
normal_context.indelsAt(pos).clear();
// we dealt with this indel; don't want to see it again
// (we might otherwise in the case when 1) there is another indel that follows
// within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel)
// for ( IndelVariant var : variants ) {
// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
// }
}
if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")");
@@ -829,18 +912,32 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH);
IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH);
if ( tumorCall.getCoverage() < minCoverage && ! genotype ) {
if ( DEBUG ) {
System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)");
}
continue; // low coverage
JexlContext jc = new MapContext();
tumorCall.fillContext(jc,tumorMetricsCassette);
normalCall.fillContext(jc,normalMetricsCassette);
boolean discard_event = false;
for ( Expression e : jexlExpressions ) {
if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; }
}
if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) {
if ( DEBUG ) {
System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
}
continue; // low coverage
if ( discard_event && ! genotype ) {
tumor_context.indelsAt(pos).clear();
normal_context.indelsAt(pos).clear();
continue; //*
}
// if ( tumorCall.getCoverage() < minCoverage && ! genotype ) {
// if ( DEBUG ) {
// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)");
// }
// continue; // low coverage
// }
// if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) {
// if ( DEBUG ) {
// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
// }
// continue; // low coverage
// }
if ( DEBUG ) {
System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, ");

@@ -868,32 +965,24 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right
// location = getToolkit().getGenomeLocParser().setStart(location,pos);
// location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data
location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data
boolean haveCall = tumorCall.isCall(); // cache the value
// boolean haveCall = tumorCall.isCall(); // cache the value
if ( haveCall || genotype ) {
if ( haveCall ) tumorCallsMade++;
if ( ! discard_event ) tumorCallsMade++;
printVCFLine(vcf_writer,normalCall,tumorCall);
printVCFLine(vcf_writer,normalCall,tumorCall,discard_event);
if ( bedWriter != null ) tumorCall.printBedLine(bedWriter);
if ( bedWriter != null ) tumorCall.printBedLine(bedWriter);
if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall, discard_event );
lastGenotypedPosition = pos;
if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall );
lastGenotypedPosition = pos;
}
tumor_context.indelsAt(pos).clear();
normal_context.indelsAt(pos).clear();
// we dealt with this indel; don't want to see it again
// (we might otherwise in the case when 1) there is another indel that follows
// within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel)
// for ( IndelVariant var : variants ) {
// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
// }
}
if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")");
@@ -947,14 +1036,14 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
}
public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall) {
public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, boolean discard_event) {
RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location));
String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));
StringBuilder fullRecord = new StringBuilder();
fullRecord.append(makeFullRecord(normalCall));
fullRecord.append(annotationString);
if ( ! normalCall.isCall() && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
if ( discard_event && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
try {
verboseWriter.write(fullRecord.toString());
verboseWriter.write('\n');

@@ -965,7 +1054,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
}
public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall) {
public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall, boolean discard_event) {
RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location));
String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));

@@ -1013,7 +1102,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
fullRecord.append('\t');
fullRecord.append(annotationString);
if ( ! tumorCall.isCall() && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
if ( discard_event && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
try {
verboseWriter.write(fullRecord.toString());

@@ -1023,7 +1112,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
}
}
public void printVCFLine(VCFWriter vcf, IndelPrecall call) {
public void printVCFLine(VCFWriter vcf, IndelPrecall call, boolean discard_event) {
long start = call.getPosition()-1;
// If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed.

@@ -1060,14 +1149,14 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
Map<String,Object> attrs = call.makeStatsAttributes(null);
if ( call.isCall() ) // we made a call - put actual het genotype here:
if ( ! discard_event ) // we made a call - put actual het genotype here:
genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
}
Set<String> filters = null;
if ( call.getVariant() != null && ! call.isCall() ) {
if ( call.getVariant() != null && discard_event ) {
filters = new HashSet<String>();
filters.add("NoCall");
}

@@ -1095,7 +1184,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
}
}
public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) {
public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall, boolean discard_event) {
long start = tCall.getPosition()-1;
long stop = start;

@@ -1112,7 +1201,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
Map<String,Object> attrs = new HashMap();
boolean isSomatic = false;
if ( nCall.getCoverage() >= minNormalCoverage && nCall.getVariant() == null && tCall.getVariant() != null ) {
if ( nCall.getVariant() == null && tCall.getVariant() != null ) {
isSomatic = true;
attrs.put(VCFConstants.SOMATIC_KEY,true);
}

@@ -1155,7 +1244,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
}
Set<String> filters = null;
if ( tCall.getVariant() != null && ! tCall.isCall() ) {
if ( tCall.getVariant() != null && discard_event ) {
filters = new HashSet<String>();
filters.add("NoCall");
}
@@ -1602,6 +1691,13 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
public IndelVariant getVariant() { return consensus_indel; }
public void fillContext(JexlContext context,String[] cassette) {
context.set(cassette[C_INDEL_F],((double)consensus_indel_count)/total_coverage);
context.set(cassette[C_INDEL_CF],((double)consensus_indel_count/all_indel_count));
context.set(cassette[C_COV],total_coverage);
context.set(cassette[C_CONS_CNT],consensus_indel_count);
}
/*
public boolean isCall() {
boolean ret = ( consensus_indel_count >= minIndelCount &&
(double)consensus_indel_count > minFraction * total_coverage &&

@@ -1610,10 +1706,11 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
" total_count="+all_indel_count+" cov="+total_coverage+
" minConsensusF="+((double)consensus_indel_count)/all_indel_count+
" minF="+((double)consensus_indel_count)/total_coverage);
return ret;
// return true;
}
*/
/** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed
* variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus)
* @param variants
@@ -36,7 +36,7 @@ public class AllelePair {
public AllelePair(Genotype gt) {
if (gt.getPloidy() != 2)
throw new ReviewedStingException("AllelePair must have ploidy of 2!");
throw new ReviewedStingException("AllelePair must have ploidy of 2! incoming gt was"+gt.toBriefString());
this.top = gt.getAllele(0);
this.bottom = gt.getAllele(1);
@@ -0,0 +1,146 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.sting.gatk.walkers.qc;

import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.SAMSequenceRecord;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;

import java.io.PrintStream;
import java.util.Collections;
import java.util.List;

/**
* Prints out counts of the number of reference ordered data objects encountered.
*
*
* <h2>Input</h2>
* <p>
* One reference file only. And optionally -L intervals
* </p>
*
* <h2>Output</h2>
* <p>
* If ok, nothing, else will throw an exception at the site where there's been a problem
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T QCRefWalker
* </pre>
*
*/
public class QCRefWalker extends RefWalker<Integer, Integer> {
@Output
public PrintStream out;

String contigName = "";
int contigStart, contigEnd;
IndexedFastaSequenceFile uncachedRef;
byte[] uncachedBases;

@Override
public void initialize() {
super.initialize(); //To change body of overridden methods use File | Settings | File Templates.
uncachedRef = getToolkit().getReferenceDataSource().getReference();
}

private final void throwError(ReferenceContext ref, String message) {
throw new StingException(String.format("Site %s failed: %s", ref.getLocus(), message));
}

public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
final String locusContigName = ref.getLocus().getContig();
if ( ! locusContigName.equals(contigName) ) {
contigName = locusContigName;
ReferenceSequence refSeq = uncachedRef.getSequence(contigName);
contigStart = 1;
contigEnd = contigStart + refSeq.length() - 1;
uncachedBases = uncachedRef.getSubsequenceAt(contigName, contigStart, contigEnd).getBases();
logger.info(String.format("Loading contig %s (%d-%d)", contigName, contigStart, contigEnd));
}

final byte refBase = ref.getBase();
if (! ( BaseUtils.isRegularBase(refBase) || isExtendFastaBase(refBase) ) )
throwError(ref, String.format("Refbase isn't a regular base (%d %c)", refBase, (char)refBase));

// check bases are equal
final int pos = (int)context.getPosition() - contigStart;
if ( pos > contigEnd )
throwError(ref, String.format("off contig (len=%d)", contigEnd));
final byte uncachedBase = uncachedBases[pos];

if ( uncachedBase != refBase )
throwError(ref, String.format("Provided refBase (%d %c) not equal to uncached one (%d %c)",
refBase, (char)refBase, uncachedBase, (char)uncachedBase));

return 1;
}

private static final boolean isExtendFastaBase(final byte b) {
switch ( b ) {
case 'U':
case 'R':
case 'Y':
case 'K':
case 'M':
case 'S':
case 'W':
case 'B':
case 'D':
case 'H':
case 'V':
case 'N':
case 'X':
case '-':
return true;
default:
return false;
}
}

public Integer reduceInit() {
return 0;
}

public Integer reduce(Integer one, Integer sum) {
return one + sum;
}
}
@@ -0,0 +1,70 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.sting.gatk.walkers.recalibration;

import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

import java.util.Arrays;

/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 9/26/11
*/

public class ContextCovariate implements ExperimentalCovariate {

private int CONTEXT_SIZE;
private String allN = "";

// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
CONTEXT_SIZE = RAC.CONTEXT_SIZE;

if (CONTEXT_SIZE <= 0)
throw new UserException("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead");

// initialize allN given the size of the context
for (int i = 0; i < CONTEXT_SIZE; i++)
allN += "N";
}

@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
byte[] bases = read.getReadBases();
for (int i = 0; i < read.getReadLength(); i++)
comparable[i] = (i < CONTEXT_SIZE) ? allN : new String(Arrays.copyOfRange(bases, i - CONTEXT_SIZE, i));
}

// Used to get the covariate's value from input csv file in TableRecalibrationWalker
@Override
public final Comparable getValue(final String str) {
return str;
}
}
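The getValues loop in the new covariate above attaches, to each read position i, the string of the CONTEXT_SIZE bases immediately preceding it (positions i-CONTEXT_SIZE through i-1), falling back to a run of 'N' characters while there are not yet enough preceding bases. A standalone illustration of what that produces for a toy read (this is not the GATK Covariate API):

    import java.util.Arrays;

    public class ContextCovariateSketch {
        public static void main(String[] args) {
            final byte[] bases = "ACGTAC".getBytes();
            final int contextSize = 3;
            final String allN = "NNN";
            for (int i = 0; i < bases.length; i++) {
                final String ctx = (i < contextSize)
                        ? allN
                        : new String(Arrays.copyOfRange(bases, i - contextSize, i));
                System.out.println(i + " -> " + ctx);
            }
            // Prints: 0 -> NNN, 1 -> NNN, 2 -> NNN, 3 -> ACG, 4 -> CGT, 5 -> GTA
        }
    }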
@@ -41,6 +41,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

import java.io.PrintStream;

@@ -76,20 +77,20 @@ import java.util.Map;
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
*
* The first 20 lines of such a file is shown below.
* The first 20 lines of such a file is shown below.
* * The file begins with a series of comment lines describing:
* ** The number of counted loci
* ** The number of counted bases
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
*
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
*
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
*
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*
*
* <pre>
* # Counted Sites 19451059
* # Counted Bases 56582018

@@ -128,13 +129,14 @@ import java.util.Map;
* -cov DinucCovariate \
* -recalFile my_reads.recal_data.csv
* </pre>
*
*/

@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
// Filter out all reads with zero or unavailable mapping quality
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
@@ -148,16 +150,19 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/////////////////////////////
// Shared Arguments
/////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();

/////////////////////////////
// Command Line Arguments
/////////////////////////////
/**
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites.
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
*/
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList();

/**

@@ -166,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
@Output(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the output covariates table recalibration file")
@Gather(CountCovariatesGatherer.class)
public PrintStream RECAL_FILE;

@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
private boolean LIST_ONLY = false;

/**
* See the -list argument to view available covariates.
*/
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
@Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
private String[] COVARIATES = null;
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
@Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
private boolean USE_STANDARD_COVARIATES = false;

/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
@Argument(fullName = "dont_sort_output", shortName = "unsorted", required = false, doc = "If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
private boolean DONT_SORT_OUTPUT = false;

/**
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
*/
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
private boolean RUN_WITHOUT_DBSNP = false;

/////////////////////////////
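The recalibration table documented above reports, for each covariate combination, the observation count, the mismatch count, and "the raw empirical quality score calculated by phred-scaling the mismatch rate". As a reminder of the standard Phred definition (not quoted from this source):

    Q_empirical = -10 * log10( mismatches / observations )

so, for example, 10 mismatches among 10000 observed bases correspond to an empirical quality of 30.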
@@ -214,6 +219,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* Adds the values of other to this, returning this
*
* @param other
* @return this object
*/

@@ -244,53 +250,55 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
*/
public void initialize() {

if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
if (RAC.FORCE_PLATFORM != null) {
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}

// Get a list of all available covariates
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>( Covariate.class ).getPlugins();
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>( RequiredCovariate.class ).getPlugins();
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>( StandardCovariate.class ).getPlugins();
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();

// Print and exit if that's what was requested
if ( LIST_ONLY ) {
logger.info( "Available covariates:" );
for( Class<?> covClass : covariateClasses ) {
logger.info( covClass.getSimpleName() );
if (LIST_ONLY) {
logger.info("Available covariates:");
for (Class<?> covClass : covariateClasses) {
logger.info(covClass.getSimpleName());
}
logger.info("");

System.exit( 0 ); // Early exit here because user requested it
System.exit(0); // Early exit here because user requested it
}

// Warn the user if no dbSNP file or other variant mask was specified
if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) {
if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) {
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
}

// Initialize the requested covariates by parsing the -cov argument
// First add the required covariates
if( requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
requestedCovariates.add( new QualityScoreCovariate() );
} else {
if (requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
requestedCovariates.add(new QualityScoreCovariate());
}
else {
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
}
// Next add the standard covariates if -standard was specified by the user
if( USE_STANDARD_COVARIATES ) {
if (USE_STANDARD_COVARIATES) {
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
// A list of Classes can't be sorted, but a list of Class names can be
final List<String> standardClassNames = new ArrayList<String>();
for( Class<?> covClass : standardClasses ) {
standardClassNames.add( covClass.getName() );
for (Class<?> covClass : standardClasses) {
standardClassNames.add(covClass.getName());
}
Collections.sort(standardClassNames); // Sort the list of class names
for( String className : standardClassNames ) {
for( Class<?> covClass : standardClasses ) { // Find the class that matches this class name
if( covClass.getName().equals( className ) ) {
for (String className : standardClassNames) {
for (Class<?> covClass : standardClasses) { // Find the class that matches this class name
if (covClass.getName().equals(className)) {
try {
final Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
final Covariate covariate = (Covariate) covClass.newInstance();
requestedCovariates.add(covariate);
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
|
@@ -299,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
// Finally parse the -cov arguments that were provided, skipping over the ones already specified
|
||||
if( COVARIATES != null ) {
|
||||
for( String requestedCovariateString : COVARIATES ) {
|
||||
if (COVARIATES != null) {
|
||||
for (String requestedCovariateString : COVARIATES) {
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : covariateClasses ) {
|
||||
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
|
||||
for (Class<?> covClass : covariateClasses) {
|
||||
if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
|
||||
foundClass = true;
|
||||
if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) {
|
||||
if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
|
||||
try {
|
||||
// Now that we've found a matching class, try to instantiate it
|
||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
final Covariate covariate = (Covariate) covClass.newInstance();
|
||||
requestedCovariates.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
|
@@ -317,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
|
||||
if (!foundClass) {
|
||||
throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
||||
logger.info("The covariates being used here: ");
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
logger.info("\t" + cov.getClass().getSimpleName());
|
||||
cov.initialize(RAC); // Initialize any covariate member variables using the shared argument collection
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
|
|
@@ -339,63 +346,63 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* For each read at this locus get the various covariate values and increment that location in the map based on
|
||||
* whether or not the base matches the reference at this particular location
|
||||
* whether or not the base matches the reference at this particular location
|
||||
*
|
||||
* @param tracker The reference metadata tracker
|
||||
* @param ref The reference context
|
||||
* @param ref The reference context
|
||||
* @param context The alignment context
|
||||
* @return Returns 1, but this value isn't used in the reduce step
|
||||
*/
|
||||
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
public CountedData map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
// Only use data from non-dbsnp sites
|
||||
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
|
||||
CountedData counter = new CountedData();
|
||||
if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
||||
if (tracker.getValues(knownSites).size() == 0) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
||||
// For each read at this locus
|
||||
for( final PileupElement p : context.getBasePileup() ) {
|
||||
final GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead();
|
||||
for (final PileupElement p : context.getBasePileup()) {
|
||||
final GATKSAMRecord gatkRead = p.getRead();
|
||||
int offset = p.getOffset();
|
||||
|
||||
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
|
||||
if (gatkRead.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) )
|
||||
{
|
||||
gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true );
|
||||
RecalDataManager.parseSAMRecord( gatkRead, RAC );
|
||||
if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
|
||||
gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
|
||||
RecalDataManager.parseSAMRecord(gatkRead, RAC);
|
||||
|
||||
// Skip over reads with no calls in the color space if the user requested it
|
||||
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) {
|
||||
gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true);
|
||||
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace(gatkRead)) {
|
||||
gatkRead.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
|
||||
continue;
|
||||
}
|
||||
|
||||
RecalDataManager.parseColorSpace( gatkRead );
|
||||
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
|
||||
RecalDataManager.computeCovariates( gatkRead, requestedCovariates ));
|
||||
RecalDataManager.parseColorSpace(gatkRead);
|
||||
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates));
|
||||
}
|
||||
|
||||
|
||||
// Skip this position if base quality is zero
|
||||
if( gatkRead.getBaseQualities()[offset] > 0 ) {
|
||||
if (gatkRead.getBaseQualities()[offset] > 0) {
|
||||
|
||||
byte[] bases = gatkRead.getReadBases();
|
||||
byte refBase = ref.getBase();
|
||||
|
||||
// Skip if this base is an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase( bases[offset] ) ) {
|
||||
if (BaseUtils.isRegularBase(bases[offset])) {
|
||||
|
||||
// SOLID bams have inserted the reference base into the read if the color space is inconsistent with the read base, so skip it
|
||||
if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
!RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) {
|
||||
if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
!RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) {
|
||||
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( counter, gatkRead, offset, refBase );
|
||||
updateDataFromRead(counter, gatkRead, offset, refBase);
|
||||
|
||||
} else { // calculate SOLID reference insertion rate
|
||||
if( refBase == bases[offset] ) {
|
||||
}
|
||||
else { // calculate SOLID reference insertion rate
|
||||
if (refBase == bases[offset]) {
|
||||
counter.solidInsertedReferenceBases++;
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
counter.otherColorSpaceInconsistency++;
|
||||
}
|
||||
}
|
||||
|
|
@@ -403,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
counter.countedSites++;
|
||||
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
}
|
||||
else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
counter.skippedSites++;
|
||||
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dbsnp mismatch rate is reasonable
|
||||
}
|
||||
|
|
@@ -411,7 +419,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
return counter;
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Update the mismatch / total_base counts for a given class of loci.
|
||||
*
|
||||
* @param counter The CountedData to be updated
|
||||
|
|
@@ -419,13 +427,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* @param refBase The reference base
|
||||
*/
|
||||
private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
|
||||
for( PileupElement p : context.getBasePileup() ) {
|
||||
for (PileupElement p : context.getBasePileup()) {
|
||||
final byte readBase = p.getBase();
|
||||
final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
|
||||
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
|
||||
if( readBaseIndex != -1 && refBaseIndex != -1 ) {
|
||||
if( readBaseIndex != refBaseIndex ) {
|
||||
if (readBaseIndex != -1 && refBaseIndex != -1) {
|
||||
if (readBaseIndex != refBaseIndex) {
|
||||
counter.novelCountsMM++;
|
||||
}
|
||||
counter.novelCountsBases++;
|
||||
|
|
@@ -437,13 +445,14 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* Major workhorse routine for this walker.
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
* @param counter Data structure which holds the counted bases
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
*
|
||||
* @param counter Data structure which holds the counted bases
|
||||
* @param gatkRead The SAMRecord holding all the data for this read
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
|
||||
final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||
|
|
@@ -451,10 +460,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
final NestedHashMap data = dataManager.data; //optimization - create local reference
|
||||
RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
RecalDatumOptimized datum = (RecalDatumOptimized) data.get(key);
|
||||
if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
// initialized with zeros, will be incremented at end of method
|
||||
datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key );
|
||||
datum = (RecalDatumOptimized) data.put(new RecalDatumOptimized(), true, (Object[]) key);
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
|
|
@@ -462,13 +471,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
final long curMismatches = datum.getNumMismatches();
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.incrementBaseCounts( base, refBase );
|
||||
datum.incrementBaseCounts(base, refBase);
|
||||
counter.countedBases++;
|
||||
counter.novelCountsBases++;
|
||||
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dbsnp mismatch rate is reasonable
|
||||
}
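Editor's note: the keyed-update scheme that updateDataFromRead describes above (the list of covariate values acts as the key into a nested map of per-key observation/mismatch counters) can be sketched in isolation. The sketch below is an illustration only: the Counts class and the flat HashMap keyed by the full covariate-value list are hypothetical stand-ins for GATK's RecalDatumOptimized and NestedHashMap, not the project's actual API.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Editor's sketch of the covariate-keyed update idea; not code from this commit.
public class CovariateKeyedUpdateSketch {

    // Hypothetical stand-in for RecalDatumOptimized: observation and mismatch counters.
    static class Counts {
        long observations;
        long mismatches;

        void incrementBaseCounts(final byte base, final byte refBase) {
            observations++;
            if (base != refBase) {
                mismatches++;
            }
        }
    }

    // A List key is used because, unlike an array, it has value-based equals/hashCode.
    private final Map<List<Comparable<?>>, Counts> data = new HashMap<List<Comparable<?>>, Counts>();

    void update(final List<Comparable<?>> covariateValues, final byte base, final byte refBase) {
        Counts datum = data.get(covariateValues);
        if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it
            datum = new Counts();
            data.put(covariateValues, datum);
        }
        datum.incrementBaseCounts(base, refBase); // one observation, possibly one mismatch
    }

    public static void main(final String[] args) {
        final CovariateKeyedUpdateSketch sketch = new CovariateKeyedUpdateSketch();
        // Hypothetical key: read group, reported quality, cycle, dinucleotide.
        final List<Comparable<?>> key = Arrays.<Comparable<?>>asList("rg1", 30, 5, "AC");
        sketch.update(key, (byte) 'A', (byte) 'A'); // match
        sketch.update(key, (byte) 'G', (byte) 'A'); // mismatch
        final Counts c = sketch.data.get(key);
        System.out.println(c.observations + " observations, " + c.mismatches + " mismatches"); // 2 observations, 1 mismatches
    }
}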
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
|
|
@@ -477,6 +485,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
|
||||
*
|
||||
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker
|
||||
*/
|
||||
public CountedData reduceInit() {
|
||||
|
|
@@ -485,11 +494,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* The Reduce method doesn't do anything for this walker.
|
||||
*
|
||||
* @param mapped Result of the map. This value is immediately ignored.
|
||||
* @param sum The summing CountedData used to output the CSV data
|
||||
* @param sum The summing CountedData used to output the CSV data
|
||||
* @return returns The sum used to output the CSV data
|
||||
*/
|
||||
public CountedData reduce( CountedData mapped, CountedData sum ) {
|
||||
public CountedData reduce(CountedData mapped, CountedData sum) {
|
||||
// Do a dbSNP sanity check every so often
|
||||
return validatingDbsnpMismatchRate(sum.add(mapped));
|
||||
}
|
||||
|
|
@@ -498,16 +508,15 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* Validate the dbSNP reference mismatch rates.
|
||||
*/
|
||||
private CountedData validatingDbsnpMismatchRate(CountedData counter) {
|
||||
if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||
if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) {
|
||||
counter.lociSinceLastDbsnpCheck = 0;
|
||||
|
||||
if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) {
|
||||
final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases;
|
||||
final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases;
|
||||
if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) {
|
||||
final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases;
|
||||
final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
|
||||
|
||||
if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
|
||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
|
||||
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
|
||||
if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
|
||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
|
||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
|
||||
}
|
||||
}
|
||||
|
|
@@ -516,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
return counter;
|
||||
}
|
||||
|
||||
public CountedData treeReduce( CountedData sum1, CountedData sum2 ) {
|
||||
public CountedData treeReduce(CountedData sum1, CountedData sum2) {
|
||||
return validatingDbsnpMismatchRate(sum1.add(sum2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the full data hashmap to disk in CSV format
|
||||
*
|
||||
* @param sum The CountedData to write out to RECAL_FILE
|
||||
*/
|
||||
public void onTraversalDone( CountedData sum ) {
|
||||
logger.info( "Writing raw recalibration data..." );
|
||||
if( sum.countedBases == 0L ) {
|
||||
public void onTraversalDone(CountedData sum) {
|
||||
logger.info("Writing raw recalibration data...");
|
||||
if (sum.countedBases == 0L) {
|
||||
throw new UserException.BadInput("Could not find any usable data in the input BAM file(s).");
|
||||
}
|
||||
outputToCSV( sum, RECAL_FILE );
|
||||
logger.info( "...done!" );
|
||||
outputToCSV(sum, RECAL_FILE);
|
||||
logger.info("...done!");
|
||||
}
|
||||
|
||||
/**
|
||||
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
|
||||
*
|
||||
* @param recalTableStream The PrintStream to write out to
|
||||
*/
|
||||
private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) {
|
||||
private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
|
||||
recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
|
||||
recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
|
||||
recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double) sum.countedSites / sum.skippedSites);
|
||||
|
||||
if( sum.solidInsertedReferenceBases != 0 ) {
|
||||
if (sum.solidInsertedReferenceBases != 0) {
|
||||
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
|
||||
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
|
||||
}
|
||||
|
||||
// Output header saying which covariates were used and in what order
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
recalTableStream.print(cov.getClass().getSimpleName().split("Covariate")[0] + ",");
|
||||
}
|
||||
recalTableStream.println("nObservations,nMismatches,Qempirical");
|
||||
|
||||
if( DONT_SORT_OUTPUT ) {
|
||||
if (DONT_SORT_OUTPUT) {
|
||||
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||
}
|
||||
|
||||
|
|
@@ -564,45 +576,47 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
|
||||
}
|
||||
|
||||
private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
private void printMappingsSorted(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
final ArrayList<Comparable> keyList = new ArrayList<Comparable>();
|
||||
for( Object comp : data.keySet() ) {
|
||||
for (Object comp : data.keySet()) {
|
||||
keyList.add((Comparable) comp);
|
||||
}
|
||||
|
||||
Collections.sort(keyList);
|
||||
|
||||
for( Comparable comp : keyList ) {
|
||||
for (Comparable comp : keyList) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
for (Object compToPrint : key) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
recalTableStream.print(compToPrint + ",");
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val );
|
||||
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
for( Object comp : data.keySet() ) {
|
||||
private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
for (Object comp : data.keySet()) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
for (Object compToPrint : key) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
recalTableStream.print(compToPrint + ",");
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappings( recalTableStream, curPos + 1, key, (Map) val );
|
||||
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
printMappings(recalTableStream, curPos + 1, key, (Map) val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@@ -32,24 +33,24 @@ import net.sf.samtools.SAMRecord;
|
|||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read, offset, and corresponding reference bases
|
||||
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read.
|
||||
* In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code.
|
||||
* This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up.
|
||||
*/
|
||||
|
||||
public interface Covariate {
|
||||
public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate
|
||||
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
|
||||
//read-specific calculations to be done just once rather than for each offset.
|
||||
public void initialize(RecalibrationArgumentCollection RAC); // Initialize any member variables using the command-line arguments passed to the walkers
|
||||
|
||||
public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
|
||||
public void getValues(GATKSAMRecord read, Comparable[] comparable);
|
||||
//Takes an array of size (at least) read.getReadLength() and fills it with covariate
|
||||
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
|
||||
//read-specific calculations to be done just once rather than for each offset.
|
||||
}
|
||||
|
||||
interface RequiredCovariate extends Covariate {
|
||||
}
|
||||
interface RequiredCovariate extends Covariate {}
|
||||
|
||||
interface StandardCovariate extends Covariate {
|
||||
}
|
||||
interface StandardCovariate extends Covariate {}
|
||||
|
||||
interface ExperimentalCovariate extends Covariate {
|
||||
}
|
||||
interface ExperimentalCovariate extends Covariate {}
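Editor's note: the per-read contract described in the Covariate interface comment above (getValues fills an array with one covariate value per read position) can be illustrated with a minimal implementing class. This is an editor's example, not part of the commit: the class name PositionCovariate is hypothetical, and it assumes it lives in the same package so that the GATKSAMRecord and RecalibrationArgumentCollection types shown in the diff are available.

// Hypothetical example implementation of the Covariate interface shown above.
// It records the 0-based position of each base in the read as the covariate value.
public class PositionCovariate implements ExperimentalCovariate {

    @Override
    public void initialize(final RecalibrationArgumentCollection RAC) {
        // No member variables to set up for this simple covariate.
    }

    @Override
    public Comparable getValue(final String str) {
        // Parse the value back out of the recalibration csv file.
        return Integer.parseInt(str);
    }

    @Override
    public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
        // Fill one entry per read position, as the interface contract requires.
        for (int i = 0; i < read.getReadLength(); i++) {
            comparable[i] = i;
        }
    }
}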
|
||||
|
|
|
|||
|
|
@@ -1,9 +1,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.EnumSet;
|
||||
|
|
@@ -39,67 +39,69 @@ import java.util.EnumSet;
|
|||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Cycle covariate.
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
|
||||
* For example, for the read: AAACCCCGAAATTTTTACTG
|
||||
* the cycle would be 11111111222333333344
|
||||
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
|
||||
* For example, for the read: AAACCCCGAAATTTTTACTG
|
||||
* the cycle would be 11111111222333333344
|
||||
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
*/
|
||||
|
||||
public class CycleCovariate implements StandardCovariate {
|
||||
private final static EnumSet<NGSPlatform> DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS);
|
||||
private final static EnumSet<NGSPlatform> FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT);
|
||||
private final static EnumSet<NGSPlatform> FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT);
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
if( RAC.DEFAULT_PLATFORM != null ) {
|
||||
if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SLX" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) ||
|
||||
RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ABI_SOLID" ) ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
if (RAC.DEFAULT_PLATFORM != null) {
|
||||
if (RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SLX") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ILLUMINA") ||
|
||||
RAC.DEFAULT_PLATFORM.contains("454") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SOLID") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ABI_SOLID")) {
|
||||
// nothing to do
|
||||
} else {
|
||||
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid");
|
||||
}
|
||||
else {
|
||||
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform. Implemented options are illumina, 454, and solid");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
|
||||
|
||||
//-----------------------------
|
||||
// Illumina, Solid, PacBio, and Complete Genomics
|
||||
//-----------------------------
|
||||
|
||||
final NGSPlatform ngsPlatform = ((GATKSAMRecord)read).getNGSPlatform();
|
||||
if( DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform) ) {
|
||||
final NGSPlatform ngsPlatform = read.getNGSPlatform();
|
||||
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
||||
final int init;
|
||||
final int increment;
|
||||
if( !read.getReadNegativeStrandFlag() ) {
|
||||
if (!read.getReadNegativeStrandFlag()) {
|
||||
// Differentiate between first and second of pair.
|
||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
||||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
|
||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
||||
//second of pair, positive strand
|
||||
init = -1;
|
||||
increment = -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
else {
|
||||
//first of pair, positive strand
|
||||
init = 1;
|
||||
increment = 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
|
||||
}
|
||||
else {
|
||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
||||
//second of pair, negative strand
|
||||
init = -read.getReadLength();
|
||||
increment = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
else {
|
||||
//first of pair, negative strand
|
||||
init = read.getReadLength();
|
||||
increment = -1;
|
||||
|
|
@@ -107,7 +109,7 @@ public class CycleCovariate implements StandardCovariate {
|
|||
}
|
||||
|
||||
int cycle = init;
|
||||
for(int i = 0; i < read.getReadLength(); i++) {
|
||||
for (int i = 0; i < read.getReadLength(); i++) {
|
||||
comparable[i] = cycle;
|
||||
cycle += increment;
|
||||
}
|
||||
|
|
@@ -116,7 +118,7 @@ public class CycleCovariate implements StandardCovariate {
|
|||
//-----------------------------
|
||||
// 454 and Ion Torrent
|
||||
//-----------------------------
|
||||
else if( FLOW_CYCLE_PLATFORMS.contains(ngsPlatform) ) {
|
||||
else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
||||
|
||||
final int readLength = read.getReadLength();
|
||||
final byte[] bases = read.getReadBases();
|
||||
|
|
@@ -133,38 +135,78 @@ public class CycleCovariate implements StandardCovariate {
|
|||
|
||||
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
||||
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
if (!read.getReadNegativeStrandFlag()) { // Forward direction
|
||||
int iii = 0;
|
||||
while( iii < readLength )
|
||||
{
|
||||
while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; }
|
||||
if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; }
|
||||
if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; }
|
||||
while (iii < readLength) {
|
||||
while (iii < readLength && bases[iii] == (byte) 'T') {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'A') {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'C') {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'G') {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
if (iii < readLength) {
|
||||
if (multiplyByNegative1)
|
||||
cycle--;
|
||||
else
|
||||
cycle++;
|
||||
}
|
||||
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
|
||||
}
|
||||
} else { // Negative direction
|
||||
int iii = readLength-1;
|
||||
while( iii >= 0 )
|
||||
{
|
||||
while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; }
|
||||
if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; }
|
||||
if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; }
|
||||
}
|
||||
else { // Negative direction
|
||||
int iii = readLength - 1;
|
||||
while (iii >= 0) {
|
||||
while (iii >= 0 && bases[iii] == (byte) 'T') {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'A') {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'C') {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'G') {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
if (iii >= 0) {
|
||||
if (multiplyByNegative1)
|
||||
cycle--;
|
||||
else
|
||||
cycle++;
|
||||
}
|
||||
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform());
|
||||
else {
|
||||
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
}
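Editor's note: the 454/Ion Torrent flow-cycle assignment documented in the CycleCovariate header comment (TACG flows; e.g. the read AAACCCCGAAATTTTTACTG gets cycles 11111111222333333344) can be reproduced by a small standalone sketch of the forward-strand case. This is an illustration under stated assumptions (cycle starts at 1; the negative-strand, SOLiD, and multiplyByNegative1 handling from the class above is omitted); it is not the class from the commit.

// Standalone sketch of the forward-strand TACG flow-cycle assignment described above.
public class FlowCycleSketch {

    static int[] flowCycles(final byte[] bases) {
        final int[] cycles = new int[bases.length];
        int cycle = 1;
        int i = 0;
        while (i < bases.length) {
            // One flow cycle grabs every T, then every A, then every C, then every G, in order.
            for (final byte flowBase : new byte[]{'T', 'A', 'C', 'G'}) {
                while (i < bases.length && bases[i] == flowBase) {
                    cycles[i] = cycle;
                    i++;
                }
            }
            if (i < bases.length) {
                cycle++; // more bases remain, so the machine has moved on to the next flow cycle
            }
        }
        return cycles;
    }

    public static void main(final String[] args) {
        final byte[] read = "AAACCCCGAAATTTTTACTG".getBytes();
        final StringBuilder sb = new StringBuilder();
        for (final int c : flowCycles(read)) {
            sb.append(c);
        }
        System.out.println(sb); // prints 11111111222333333344, matching the worked example in the comment
    }
}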
|
||||
|
|
@@ -1,7 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
|
|
@@ -42,63 +43,30 @@ import java.util.HashMap;
|
|||
|
||||
public class DinucCovariate implements StandardCovariate {
|
||||
|
||||
private static final byte NO_CALL = (byte)'N';
|
||||
private static final byte NO_CALL = (byte) 'N';
|
||||
private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL);
|
||||
|
||||
private HashMap<Integer, Dinuc> dinucHashMap;
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' };
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
final byte[] BASES = {(byte) 'A', (byte) 'C', (byte) 'G', (byte) 'T'};
|
||||
dinucHashMap = new HashMap<Integer, Dinuc>();
|
||||
for( byte byte1 : BASES ) {
|
||||
for( byte byte2: BASES ) {
|
||||
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
|
||||
for (byte byte1 : BASES) {
|
||||
for (byte byte2 : BASES) {
|
||||
dinucHashMap.put(Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2)); // This might seem silly, but Strings are too slow
|
||||
}
|
||||
}
|
||||
// Add the "no dinuc" entry too
|
||||
dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC );
|
||||
dinucHashMap.put(Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC);
|
||||
}
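Editor's note: the comment above motivates keying the dinucleotide map by an int hash of two bases instead of by a String, to avoid a String allocation per lookup. The diff does not show Dinuc.hashBytes itself, so the packing below is only a plausible editor's sketch of the idea, not GATK's actual implementation.

// Editor's sketch: one plausible two-byte packing that yields a distinct int per ordered base pair.
public class DinucHashSketch {

    static int packBases(final byte first, final byte second) {
        return ((first & 0xFF) << 8) | (second & 0xFF);
    }

    public static void main(final String[] args) {
        System.out.println(packBases((byte) 'A', (byte) 'C')); // prints 16707, i.e. ('A' << 8) | 'C'
    }
}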
|
||||
|
||||
/*
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
|
||||
byte base;
|
||||
byte prevBase;
|
||||
final byte[] bases = read.getReadBases();
|
||||
// If this is a negative strand read then we need to reverse the direction for our previous base
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
// No dinuc at the beginning of the read
|
||||
if( offset == bases.length-1 ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) );
|
||||
// Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) );
|
||||
} else {
|
||||
// No dinuc at the beginning of the read
|
||||
if( offset == 0 ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
base = bases[offset];
|
||||
// Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
prevBase = bases[offset - 1];
|
||||
}
|
||||
|
||||
// Make sure the previous base is good
|
||||
if( !BaseUtils.isRegularBase( prevBase ) ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
|
||||
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read.
|
||||
*/
|
||||
public void getValues( SAMRecord read, Comparable[] result ) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
|
||||
final HashMap<Integer, Dinuc> dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap
|
||||
final int readLength = read.getReadLength();
|
||||
final boolean negativeStrand = read.getReadNegativeStrandFlag();
|
||||
|
|
@@ -108,50 +76,51 @@ public class DinucCovariate implements StandardCovariate {
|
|||
int offset = 0;
|
||||
// If this is a negative strand read then we need to reverse the direction for our previous base
|
||||
|
||||
if(negativeStrand) {
|
||||
if (negativeStrand) {
|
||||
bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place
|
||||
}
|
||||
result[0] = NO_DINUC; // No dinuc at the beginning of the read
|
||||
comparable[0] = NO_DINUC; // No dinuc at the beginning of the read
|
||||
|
||||
prevBase = bases[0];
|
||||
offset++;
|
||||
while(offset < readLength) {
|
||||
// Note: We are using the previous base in the read, not the
|
||||
// previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
base = bases[offset];
|
||||
if( BaseUtils.isRegularBase( prevBase ) ) {
|
||||
result[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) );
|
||||
} else {
|
||||
result[offset] = NO_DINUC;
|
||||
}
|
||||
while (offset < readLength) {
|
||||
// Note: We are using the previous base in the read, not the
|
||||
// previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
base = bases[offset];
|
||||
if (BaseUtils.isRegularBase(prevBase)) {
|
||||
comparable[offset] = dinucHashMapRef.get(Dinuc.hashBytes(prevBase, base));
|
||||
}
|
||||
else {
|
||||
comparable[offset] = NO_DINUC;
|
||||
}
|
||||
|
||||
offset++;
|
||||
prevBase = base;
|
||||
offset++;
|
||||
prevBase = base;
|
||||
}
|
||||
if(negativeStrand) {
|
||||
reverse( result );
|
||||
if (negativeStrand) {
|
||||
reverse(comparable);
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
byte[] bytes = str.getBytes();
|
||||
final Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( bytes[0], bytes[1] ) );
|
||||
if( returnDinuc.compareTo(NO_DINUC) == 0 ) {
|
||||
final Dinuc returnDinuc = dinucHashMap.get(Dinuc.hashBytes(bytes[0], bytes[1]));
|
||||
if (returnDinuc.compareTo(NO_DINUC) == 0) {
|
||||
return null;
|
||||
}
|
||||
return returnDinuc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reverses the given array in place.
|
||||
*
|
||||
* @param array
|
||||
* @param array any array
|
||||
*/
|
||||
private static void reverse(final Comparable[] array) {
|
||||
final int arrayLength = array.length;
|
||||
for(int l = 0, r = arrayLength - 1; l < r; l++, r--) {
|
||||
for (int l = 0, r = arrayLength - 1; l < r; l++, r--) {
|
||||
final Comparable temp = array[l];
|
||||
array[l] = array[r];
|
||||
array[r] = temp;
|
||||