Updated and near final version of tabular recalibration system. Uses 'yates' correction for low-occupancy quality bins. Faster and more robust handling of input and output

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1082 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-24 03:52:12 +00:00
parent caf5aef0f8
commit 0a50f2e160
5 changed files with 561 additions and 237 deletions

View File

@ -33,7 +33,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
//@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
//public int MAX_READ_GROUPS = 100;
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
public List<String> platforms = Collections.singletonList("*");
//public List<String> platforms = Collections.singletonList("ILLUMINA");
@ -50,10 +50,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
long counted_bases = 0; // number of bases used to count covariates
long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
PrintStream recalTableOut = null;
public void initialize() {
//if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS )
// Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." +
// "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter");
try {
recalTableOut = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
} catch ( FileNotFoundException e ) {
throw new RuntimeException("Couldn't open output file", e);
}
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
if( readGroup.getAttribute("PL") == null )
@ -68,12 +72,31 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
out.printf("Created recalibration data collectors for %d read group(s)%n", data.size());
}
/**
* Get the particular RecalData datum associated with readGroup, at machine pos, with reported
* quality qual, and with the dinuc context of prevBase, base. If an example of such a
* base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
* system so that subsequent requests will return that object, and returns it.
*
* @param readGroup
* @param pos
* @param qual
* @param prevBase
* @param base
* @return
*/
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
byte[] cs = {(byte)prevBase, (byte)base};
String s = new String(cs);
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
}
/**
* Get a list of all of the RecalData associated with readGroup
*
* @param readGroup
* @return
*/
private List<RecalData> getRecalData(String readGroup) {
return data.get(readGroup).getAll();
}
@ -110,6 +133,20 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
return 1;
}
/**
* Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
* Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
* but by implied machine cycle associated with the offset.
*
* TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based
* TODO: and offset is 0-based. How very annoying.
*
* @param rg
* @param read
* @param offset
* @param ref
* @return
*/
private int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
int cycle = offset;
byte[] bases = read.getReadBases();
@ -126,13 +163,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
int qual = quals[offset];
if ( qual > 0 ) { // && qual <= QualityUtils.MAX_QUAL_SCORE ) {
// previous base is the next base in terms of machine chemistry if this is a negative strand
// if ( qual == 2 )
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("Adding neg?=%b b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c name=%s%n",
// read.getReadNegativeStrandFlag(), (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base,
// ref, (char)BaseUtils.simpleComplement(ref), read.getReadName());
if ( qual > 0 ) {
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
if (datum != null) datum.inc(base,ref);
return 1;
@ -153,6 +184,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
//out.printf("...done%n");
}
/**
* Prints some basic information about the CountCovariates run to the output stream out
* @param out
*/
private void printInfo(PrintStream out) {
out.printf("# date %s%n", new Date());
out.printf("# collapsed_pos %b%n", collapsePos);
@ -163,6 +198,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
}
@Deprecated
private void writeLogisticRecalibrationTable() {
PrintStream dinuc_out = null;
try {
@ -189,36 +225,28 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
}
private void writeRecalTable() {
PrintStream table_out = null;
try {
table_out = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv");
printInfo(table_out);
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
if ( datum.N > 0 )
table_out.format("%s%n", datum.toCSVString(collapsePos));
}
}
} catch (FileNotFoundException e ) {
System.err.println("FileNotFoundException: " + e.getMessage());
}
finally {
if (table_out != null) table_out.close();
}
}
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer a, Integer b) {
return 0;
/**
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
*/
private void writeRecalTable() {
printInfo(recalTableOut);
recalTableOut.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
// TODO: should sort the data coming out of getRecalData here for easier processing
for ( RecalData datum: RecalData.sort(getRecalData(readGroup.getReadGroupId())) ) {
if ( datum.N > 0 )
recalTableOut.format("%s%n", datum.toCSVString(collapsePos));
}
}
recalTableOut.close();
}
/**
* Check to see whether this read group should be processed.
* Check to see whether this read group should be processed. Returns true if the
* read group is in the list of platforms to process or the platform == *, indicating
* that all platforms should be processed.
*
* @param readGroup
* @return
*/
@ -233,4 +261,24 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
return false;
}
/**
* No initialization routines
*
* @return
*/
public Integer reduceInit() {
return 0;
}
/**
* Doesn't do anything
*
* @param a
* @param b
* @return
*/
public Integer reduce(Integer a, Integer b) {
return 0;
}
}

View File

@ -3,14 +3,25 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.BaseUtils;
public class RecalData {
long N;
long B;
int pos;
int qual;
String readGroup;
String dinuc;
import java.util.List;
import java.util.Collections;
public class RecalData implements Comparable<RecalData> {
//
// context of this data -- read group, position in read, reported quality, and dinuc
//
String readGroup; // the read group where the datum was observed
int pos; // the position of this recalibration datum
int qual; // the reported quality scoree of this datum
String dinuc; // the dinucleotide context of this datum
//
// empirical quality score data
//
long N; // total number of observed bases
long B; // number of observed mismatches to the reference base
public RecalData(int pos, int qual, String readGroup, String dinuc ) {
this.pos = pos;
this.qual = qual;
@ -18,61 +29,164 @@ public class RecalData {
this.dinuc = dinuc;
}
public void inc(long incN, long incB) {
public RecalData( RecalData copy ) {
this.pos = copy.pos;
this.qual = copy.qual;
this.readGroup = copy.readGroup;
this.dinuc = copy.dinuc;
this.N = copy.N;
this.B = copy.B;
}
public int compareTo(RecalData to) {
int cmpReadGroup = readGroup.compareTo(to.readGroup);
if (cmpReadGroup != 0) return cmpReadGroup;
int cmpPos = new Integer(pos).compareTo(to.pos);
if (cmpPos != 0) return cmpPos;
int cmpQual = new Integer(qual).compareTo(to.qual);
if (cmpQual != 0) return cmpQual;
return dinuc.compareTo(to.dinuc);
}
/**
* Returns the expected number of mismatching bases for this datum, using the number of bases N and
* the reported qual score. Effectively qual implied error rate * number of bases.
*
* @return
*/
public double getExpectedErrors() {
return QualityUtils.qualToErrorProb((byte)qual) * N;
}
//
// Increment routines
//
public RecalData inc(long incN, long incB) {
N += incN;
B += incB;
return this;
}
//public int getDinucIndex() {
// return string2dinucIndex(this.dinuc);
//}
/**
* Add N and B from other to self
*
* @param other
* @return
*/
public RecalData inc(final RecalData other) {
return inc(other.N, other.B);
}
public void inc(char curBase, char ref) {
inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
/**
* Add all of the Ns and Bs for data to self
*
* @param data
* @return
*/
public RecalData inc(List<RecalData> data) {
for ( RecalData other : data ) {
this.inc(other);
}
return this;
}
/**
* Count an empirical observation of our current base against the reference base.
* Increments N and, if curBase != ref, also increments B
*
* @param curBase
* @param ref
* @return
*/
public RecalData inc(char curBase, char ref) {
return inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
//out.printf("%s %s\n", curBase, ref);
}
/**
* Get the integer dinuc index associated with this datum. Can return -1 if dinuc
* contains unexpected bases. Returns 0 if dinuc contains the * operator (not tracking dinucs)
* @return
*/
public int getDinucIndex() {
return dinucIndex(this.dinuc);
}
/**
* Calculates the empirical quality score for this datum from B and N. Bounds the result
* by QualityUtils.MAX_REASONABLE_Q_SCORE.
*
* @return
*/
public double empiricalQualDouble(final boolean useRawQempirical) {
double doubleB = useRawQempirical ? B : B + 1;
double doubleN = useRawQempirical ? N : N + 1;
double empiricalQual = -10 * Math.log10(doubleB / doubleN);
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
return empiricalQual;
}
public double empiricalQualDouble() { return empiricalQualDouble(true); }
/**
* As as empiricalQualDouble, but rounded and encoded as a byte for placement in a read quality array of bytes
* @return
*/
public byte empiricalQualByte(final boolean useRawQempirical) {
double doubleB = useRawQempirical ? B : B + 1;
double doubleN = useRawQempirical ? N : N + 1;
return QualityUtils.probToQual(1.0 - doubleB / doubleN);
}
public byte empiricalQualByte() { return empiricalQualByte(true); }
public static String headerString() {
return ("pos, rg, dinuc, qual, emp_qual, qual_diff, n, b");
}
public double empiricalQualDouble() {
double empiricalQual = -10 * Math.log10((double)B / N);
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
return empiricalQual;
}
public byte empiricalQualByte() {
return QualityUtils.probToQual(1.0 - (double)B / N);
}
public String toString() {
double empiricalQual = empiricalQualDouble();
return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
return String.format("[rg=%s, pos=%3d, Qrep=%3d, dinuc=%s, Qemp=%2.2f / %2.2f, B=%d, N=%d]", readGroup, pos, qual, dinuc, empiricalQualDouble(), empiricalQualDouble(false), B, N);
//return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
}
//
// To and from CSV strings
//
public String toCSVString(boolean collapsedPos) {
return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B, empiricalQualByte());
// rg,pos,Qrep,dn,nBases,nMismatches,Qemp
return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, collapsedPos ? "*" : pos, qual, dinuc, N, B, empiricalQualByte());
}
/**
* Parses the string s and returns the encoded RecalData in it. Assumes data has been emitted by
* toCSVString() routine. Order is readGroup, dinuc, qual, pos, N, B, Qemp as byte
* @param s
* @return
*/
public static RecalData fromCSVString(String s) {
String[] vals = s.split(",");
String rg = vals[0];
String dinuc = vals[1];
int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]);
int qual = Integer.parseInt(vals[2]);
int pos = vals[3].equals("*") ? 0 : Integer.parseInt(vals[3]);
String dinuc = vals[3];
int N = Integer.parseInt(vals[4]);
int B = Integer.parseInt(vals[5]);
RecalData datum = new RecalData(pos, qual, rg, dinuc);
datum.B = B;
datum.N = N;
//if ( datum.N > 0 ) System.out.printf("Parsing line [%s] => [%s]%n", s, datum);
return datum;
}
public static List<RecalData> sort(List<RecalData> l) {
Collections.sort(l);
return l;
}
public static int bases2dinucIndex(char prevBase, char base) {
int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase);
int bI = BaseUtils.simpleBaseToBaseIndex(base);
@ -93,25 +207,17 @@ public class RecalData {
return bases2dinucIndex((char)prevBase, (char)base);
}
//
// private static int nuc2num[];
// private static char num2nuc[];
//
// static {
// nuc2num = new int[128];
// nuc2num['A'] = 0;
// nuc2num['C'] = 1;
// nuc2num['G'] = 2;
// nuc2num['T'] = 3;
// nuc2num['a'] = 0;
// nuc2num['c'] = 1;
// nuc2num['g'] = 2;
// nuc2num['t'] = 3;
//
// num2nuc = new char[4];
// num2nuc[0] = 'A';
// num2nuc[1] = 'C';
// num2nuc[2] = 'G';
// num2nuc[3] = 'T';
// }
public static double combinedQreported(List<RecalData> l) {
double sumExpectedErrors = 0;
long nBases = 0;
for ( RecalData datum : l ) {
sumExpectedErrors += datum.getExpectedErrors();
nBases += datum.N;
}
double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases);
System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
return q;
}
}

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.ExpandingArrayList;
import java.util.*;
@ -64,15 +63,19 @@ public class RecalDataManager {
throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup));
}
if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null )
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
RecalData prev = getRecalData(datum.pos, datum.qual, datum.dinuc);
if ( prev != null ) {
if ( trackDinuc && trackPos )
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
prev.inc(datum);
} else {
int posIndex = getPosIndex(datum.pos);
int internalDinucIndex = getDinucIndex(datum.dinuc);
if ( internalDinucIndex == -1 ) return;
int posIndex = getPosIndex(datum.pos);
int internalDinucIndex = getDinucIndex(datum.dinuc);
if ( internalDinucIndex == -1 ) return;
set(posIndex, datum.qual, internalDinucIndex, datum);
flattenData.add(datum);
set(posIndex, datum.qual, internalDinucIndex, datum);
flattenData.add(datum);
}
}
public RecalData getRecalData(int pos, int qual, String dinuc) {
@ -157,47 +160,41 @@ public class RecalDataManager {
return l;
}
*/
public List<RecalData> getDataByPos() {
List<RecalData> l = new ArrayList<RecalData>(data.length);
for ( int pos = 0; pos < maxReadLen; pos++ ) {
for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) {
RecalData datum = new RecalData(pos, qual, readGroup, "**");
for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) {
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
if ( datum2 != null )
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
}
if ( datum.N > 0 ) l.add(datum);
private RecalData getMatchingDatum(List<RecalData> l, RecalData datum,
boolean combinePos, boolean combineQual, boolean combineDinuc) {
for ( RecalData find : l ) {
if ( (combineQual || find.qual == datum.qual) &&
(combinePos || find.pos == datum.pos ) &&
(combineDinuc || find.dinuc.equals(datum.dinuc)) ) {
//System.out.printf("Found %s for %s%n", find, datum);
return find;
}
}
System.out.printf("getDataByPos => %d%n", l.size());
return l;
RecalData d = new RecalData(combinePos ? 0 : datum.pos, combineQual ? 0 : datum.qual, datum.readGroup, combineDinuc ? "**" : datum.dinuc );
//System.out.printf("Making new match %s%n", d);
l.add(d);
return d;
}
public List<RecalData> getDataByDinuc() {
List<RecalData> l = new ArrayList<RecalData>(nDinucs);
public List<RecalData> combineDinucs() {
return combine(false, false, true);
}
for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) {
for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) {
RecalData datum = new RecalData(-1, qual, readGroup, RecalData.dinucIndex2bases(dinucIndex));
System.out.printf("Aggregating [%s]:%n", datum);
for ( int pos = 0; pos < data.length; pos++ ) {
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
if ( datum2 != null ) {
System.out.printf(" + [%s]:%n", datum2);
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
}
}
if ( datum.N > 0 ) l.add(datum);
System.out.printf(" %s [%s]:%n", datum.N > 0 ? "=>" : "<>", datum);
}
public List<RecalData> combineCycles() {
return combine(true, false, false);
}
public List<RecalData> combine(boolean ignorePos, boolean ignoreQual, boolean ignoreDinuc) {
List<RecalData> l = new ArrayList<RecalData>();
for ( RecalData datum : flattenData ) {
RecalData reduced = getMatchingDatum(l, datum, ignorePos, ignoreQual, ignoreDinuc);
//System.out.printf("Combining %s with %s%n", datum, reduced);
reduced.inc(datum);
}
System.out.printf("getDataByDinuc => %d%n", l.size());
return l;
}*/
}
public List<RecalData> getAll() {
return flattenData;

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.*;
@ -24,6 +49,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(shortName="outputBAM", doc="output BAM file", required=false)
public String outputBamFile = null;
@Argument(shortName="rawQempirical", doc="If provide, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
public boolean rawQempirical = false;
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
private final static boolean DEBUG = false;
@ -31,8 +59,18 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
//@Argument(shortName="serial", doc="", required=false)
//boolean serialRecalibration = false;
public enum RecalibrationMode {
COMBINATORIAL,
JAFFE,
BY_POS_ONLY,
BY_DINUC_ONLY,
SEQUENTIAL,
ERROR
}
@Argument(shortName="mode", doc="", required=false)
public String modeString = RecalibrationMode.SEQUENTIAL.toString();
public RecalibrationMode mode = RecalibrationMode.ERROR;
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)");
@ -40,6 +78,23 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
public void initialize() {
//
// crap hack until Enum arg types are supported
//
for ( RecalibrationMode potential : RecalibrationMode.values() ) {
if ( potential.toString().equals(modeString)) {
mode = potential;
break;
}
}
if ( mode == RecalibrationMode.ERROR )
throw new RuntimeException("Unknown mode requested: " + modeString);
//
// Walk over the data file, parsing it into recalibration data
//
int lineNumber = 0;
try {
System.out.printf("Reading data...%n");
List<RecalData> data = new ArrayList<RecalData>();
@ -48,13 +103,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
List<String> lines = new xReadLines(new File(paramsFile)).readLines();
for ( String line : lines ) {
//System.out.printf("Reading line %s%n", line);
lineNumber++;
if ( HEADER_PATTERN.matcher(line).matches() )
continue;
if ( COMMENT_PATTERN.matcher(line).matches() ) {
collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos);
collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc);
//System.out.printf("Collapsed %b %b%n", collapsedPos, collapsedDinuc);
}
else {
data.add(RecalData.fromCSVString(line));
@ -63,13 +117,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
initializeCache(data, collapsedPos, collapsedDinuc);
} catch ( FileNotFoundException e ) {
Utils.scareUser("Cannot read/find parameters file " + paramsFile);
} catch ( NumberFormatException e ) {
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber, e);
}
}
private boolean parseCommentLine(Pattern pat, String line, boolean flag) {
Matcher m = pat.matcher(line);
if ( m.matches() ) {
//System.out.printf("Parsing %s%n", m.group(1));
flag = Boolean.parseBoolean(m.group(1));
}
@ -95,15 +150,40 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
logger.info(String.format("Max pos : %d", maxPos));
logger.info(String.format("Max Q reported : %d", maxQReported));
// initialize the data structure
// check the mode
switch ( mode ) {
case JAFFE:
collapsedPos = collapsedDinuc = true;
break;
case BY_POS_ONLY:
if ( collapsedPos )
throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
collapsedPos = true;
break;
case BY_DINUC_ONLY:
if ( collapsedDinuc )
throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
collapsedDinuc = true;
break;
case COMBINATORIAL:
if ( collapsedPos || collapsedDinuc )
throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
case SEQUENTIAL:
if ( collapsedPos || collapsedDinuc )
throw new RuntimeException(String.format("Cannot perform sequential recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
}
//
// initialize the data structures
//
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
for ( String readGroup : readGroups ) {
// create a manager for each read group
RecalDataManager manager = new RecalDataManager(readGroup, ! collapsedPos, ! collapsedDinuc);
//RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc);
managers.put(readGroup, manager);
}
// fill in the manager structure
// add the data to their appropriate readGroup managers
for ( RecalData datum : data ) {
managers.get(datum.readGroup).addDatum(datum);
}
@ -111,15 +191,36 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// fill in the table with mapping objects
for ( String readGroup : readGroups ) {
RecalDataManager manager = managers.get(readGroup);
RecalMapping mapper = null;
//if ( serialRecalibration )
// mapper = new SerialRecalMapping(manager, dinucs, maxPos, maxQReported);
//else
mapper = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported);
RecalMapping mapper = initializeMapper(manager, rawQempirical, dinucs, maxPos, maxQReported);
logger.info(String.format("Creating mapper for %s of class %s", readGroup, mapper));
cache.put(readGroup, mapper);
}
}
/**
* Returns a new RecalMapping object appropriate for the requested mode in this.mode for the RecalData
* in manager. Uses the dinucs set dinucs, maxPos, and maxQReported to build efficient lookup structures
* for mapping from old qual and features -> new qual
*
* @param manager
* @param dinucs
* @param maxPos
* @param maxQReported
* @return
*/
private RecalMapping initializeMapper( RecalDataManager manager, final boolean rawQempirical,
Set<String> dinucs, int maxPos, int maxQReported ) {
switch ( mode ) {
case COMBINATORIAL:
case JAFFE:
return new CombinatorialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported);
case SEQUENTIAL:
return new SerialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported);
default:
throw new RuntimeException("Unimplemented recalibration mode: " + mode);
}
}
public SAMRecord map(char[] ref, SAMRecord read) {
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
@ -132,16 +233,21 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
if (read.getReadNegativeStrandFlag())
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
recalQuals = BaseUtils.reverse(recalQuals);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("NEW: %s%n", read.format());
return read;
}
/**
* Workhorse routine. Given a read group and an array of bases and quals, returns a new set of recalibrated
* qualities for each base/qual in bases and quals. Uses the RecalMapping object associated with readGroup.
*
* @param readGroup
* @param bases
* @param quals
* @return
*/
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) {
byte[] recalQuals = new byte[quals.length];
RecalMapping mapper = cache.get(readGroup);
@ -150,9 +256,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
byte qual = quals[cycle];
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("Processing cycle=%d qual=%d: neg?=%b => %d at %s%n",
// cycle, qual, read.getReadNegativeStrandFlag(), newQual, read.getReadName());
recalQuals[cycle] = newQual;
//System.out.printf("Mapping %d => %d%n", qual, newQual);
}
@ -160,6 +263,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
return recalQuals;
}
// --------------------------------------------------------------------------------------------------------------
//
// Standard I/O routines
//
// --------------------------------------------------------------------------------------------------------------
public void onTraversalDone(SAMFileWriter output) {
if ( output != null ) {
output.close();
@ -167,7 +275,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
public SAMFileWriter reduceInit() {
if ( outputBamFile != null ) { // ! outputBamFile.equals("") ) {
if ( outputBamFile != null ) {
SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
return Utils.createSAMFileWriterWithCompression(header, true, outputBamFile, getToolkit().getBAMCompression());
}
@ -177,8 +285,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
/**
* Summarize the error rate data.
*
* Write out the read
*/
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
if ( output != null ) {
@ -199,7 +306,8 @@ class CombinatorialRecalMapping implements RecalMapping {
ArrayList<byte[][]> cache;
RecalDataManager manager;
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
public CombinatorialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
Set<String> dinucs, int maxPos, int maxQReported ) {
this.manager = manager;
// initialize the data structure
@ -211,11 +319,12 @@ class CombinatorialRecalMapping implements RecalMapping {
for ( RecalData datum : manager.getAll() ) {
//System.out.printf("Adding datum %s%n", datum);
byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc));
if ( table[datum.pos][datum.qual] != 0 )
int pos = manager.canonicalPos(datum.pos);
if ( table[pos][datum.qual] != 0 )
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
table[datum.pos][datum.qual] = datum.empiricalQualByte();
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical);
//System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical));
}
}
@ -241,102 +350,154 @@ class CombinatorialRecalMapping implements RecalMapping {
}
}
/*class CombinatorialRecalMapping implements RecalMapping {
HashMap<String, byte[][]> cache = new HashMap<String, byte[][]>();
/**
* Implements a serial recalibration of the reads using the combinational table. First,
* we perform a positional recalibration, and then a subsequent dinuc correction.
*
* Given the full recalibration table, we perform the following preprocessing steps:
*
* - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc)
*/
class SerialRecalMapping implements RecalMapping {
private double globalDeltaQ = 0.0;
private double[][] deltaQPosMap, deltaQDinucMap;
double [] deltaQualMap;
RecalData [][] qPosSupports, qDinucSupports;
CombinatorialRecalMapping combiMap;
RecalDataManager manager;
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
String dinucToLookAt = null; // "CC";
int posToLookAt = 0;
int qualToLookAt = 25;
public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
Set<String> dinucs, int maxPos, int maxQReported ) {
//combiMap = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported );
this.manager = manager;
// initialize the data structure
for ( String dinuc : dinucs ) {
byte[][] table = new byte[maxPos+1][maxQReported+1];
cache.put(dinuc, table);
{
RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll());
double aggregrateQreported = RecalData.combinedQreported(manager.getAll());
globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported;
System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
}
for ( RecalData datum : manager.getAll() ) {
//System.out.printf("Adding datum %s%n", datum);
byte [][] table = cache.get(datum.dinuc);
if ( table[datum.pos][datum.qual] != 0 )
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
table[datum.pos][datum.qual] = datum.empiricalQualByte();
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
if ( printStateP(datum) )
System.out.printf("PrintValue: %s%n", datum);
}
// Jaffe-style
deltaQualMap = new double[maxQReported+1];
for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) {
deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ;
System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
}
// calculate the delta Q pos array
deltaQPosMap = new double[maxPos+1][maxQReported+1];
qPosSupports = new RecalData[maxPos+1][maxQReported+1];
for ( RecalData datumAtPosQual : manager.combineDinucs() ) {
double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual];
updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset);
}
// calculate the delta Q dinuc array
deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1];
qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
for ( RecalData datumAtDinucQual : manager.combineCycles() ) {
double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual];
updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset);
}
validateTables(maxPos, maxQReported, dinucs.size(), deltaQPosMap, deltaQDinucMap, useRawQempirical, manager.getAll());
}
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
//if ( qual == 2 )
// System.out.printf("Qual = 2%n");
byte[] bp = {prevBase, base};
String dinuc = manager.canonicalDinuc(new String(bp));
int pos = manager.canonicalPos(cycle);
byte[][] dataTable = cache.get(dinuc);
if ( dataTable == null && prevBase != 'N' && base != 'N' )
throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc));
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
//if ( result == 2 )
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
return result;
}
}*/
/*
class SerialRecalMapping implements RecalMapping {
// mapping from dinuc x Q => new Q
HashMap<String, byte[]> mappingByDinuc;
// mapping from pos x Q => new Q
byte[][] mappingByPos;
public SerialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
mappingByDinuc = new HashMap<String, byte[]>();
for ( String dinuc : dinucs ) {
byte[] table = new byte[maxQReported+1];
mappingByDinuc.put(dinuc, table);
}
for ( RecalData datum : manager.getDataByDinuc() ) {
//System.out.printf("Adding datum %s%n", datum);
if ( mappingByDinuc.get(datum.dinuc).length <= datum.qual ) {
throw new RuntimeException(String.format("Unexpectedly massive Q score of %d found, calculated max was %d%n", maxQReported, datum.qual));
private void validateTables(int maxPos, int maxQReported, int nDinucs,
double[][] deltaQPosMap, double[][] deltaQDinucMap,
final boolean useRawQempirical, List<RecalData> data ) {
for ( int i = 0; i < maxPos; i++ ) {
for ( int j = 0; j < maxQReported; j++ ) {
if ( printStateP(i, null, j) )
System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n",
i, j, deltaQPosMap[i][j], qPosSupports[i][j]);
}
mappingByDinuc.get(datum.dinuc)[datum.qual] = datum.empiricalQualByte();
}
// initialize the mapping by position
mappingByPos = new byte[maxPos+1][maxQReported+1];
for ( RecalData datum : manager.getDataByPos() ) {
//System.out.printf("Adding datum %s%n", datum);
mappingByPos[datum.pos][datum.qual] = datum.empiricalQualByte();
for ( int i = 0; i < nDinucs; i++ ) {
for ( int j = 0; j < maxQReported; j++ ) {
String dinuc = RecalData.dinucIndex2bases(i);
if ( printStateP(0, dinuc, j ) )
System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n",
dinuc, j, deltaQDinucMap[i][j], qDinucSupports[i][j]);
}
}
for ( RecalData datum : RecalData.sort(data) ) {
byte newQual = getNewQual(datum.readGroup, (byte)datum.dinuc.charAt(0), (byte)datum.dinuc.charAt(1), datum.pos, (byte)datum.qual);
if ( printStateP( datum ) ) {
System.out.printf("Serial mapping %s => %d => %d vs. %d => delta = %d%n",
datum, newQual, datum.empiricalQualByte(useRawQempirical), datum.empiricalQualByte(), newQual - datum.empiricalQualByte(useRawQempirical));
}
}
}
private void updateCache( RecalData[][] supports, RecalData datum, final boolean useRawQempirical,
double[][] table, int i, int j, double meanQ ) {
if ( table[i][j] != 0 )
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ;
table[i][j] = deltaQ;
supports[i][j] = datum;
}
private boolean printStateP( int cycle, String dinuc, int qual ) {
return posToLookAt != -1 &&
( cycle == 0 || posToLookAt == 0 || cycle == posToLookAt ) &&
( dinucToLookAt == null || dinuc == null || manager.getDinucIndex(dinuc) == -1 || dinucToLookAt.equals(dinuc)) &&
( qualToLookAt == 0 || qual == qualToLookAt );
}
private boolean printStateP( RecalData datum ) {
return printStateP(datum.pos, datum.dinuc, datum.qual);
}
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
//System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d%n", readGroup, prevBase, base, cycle, qual);
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
byte[] bp = {prevBase, base};
String dinuc = new String(bp);
int pos = manager.canonicalPos(cycle);
int index = this.manager.getDinucIndex(prevBase, base);
byte newQualFromDinuc = 0;
byte newQualFromPos = cycle > 0 && cycle < mappingByPos.length ? mappingByPos[cycle][qual] : qual;
byte newQual = newQualFromPos;
if ( prevBase != 'N' && base != 'N' ) {
// if the qual got mapped too high, assume it's the best we've seen for recalibration purposes
int newQualIndex = newQual < mappingByDinuc.get(dinuc).length ? newQual : mappingByDinuc.get(dinuc).length - 1;
newQualFromDinuc = mappingByDinuc.get(dinuc)[newQualIndex];
newQual = newQualFromDinuc;
}
double deltaQual = deltaQualMap[qual];
double deltaQPos = pos >= deltaQPosMap.length ? 0.0 : deltaQPosMap[pos][qual]; // == length indices no data for last base
double deltaQDinuc = index == -1 ? 0.0 : deltaQDinucMap[index][qual]; // -1 indices N in prev or current base
double newQual = qual + globalDeltaQ + deltaQual + deltaQPos + deltaQDinuc;
byte newQualByte = QualityUtils.boundQual((int)Math.round(newQual), QualityUtils.MAX_REASONABLE_Q_SCORE);
//System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d => %d => %d => %d%n",
// readGroup, prevBase, base, cycle, qual, newQualFromPos, newQualFromDinuc, newQual);
//if ( printStateP(pos, RecalData.dinucIndex2bases(index), qual) )
// System.out.printf("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d%n",
// readGroup, prevBase, base, cycle, qual,
// qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc,
// newQualByte);
return newQual;
//byte combiQualByte = combiMap.getNewQual(readGroup, prevBase, base, cycle, qual);
//System.out.printf("%s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d vs %d (%d)%n",
// readGroup, prevBase, base, cycle, qual,
// qual, globalDeltaQ, deltaQPos, deltaQDinuc,
// newQualByte, combiQualByte, newQualByte - combiQualByte);
if ( newQualByte <= 0 && newQualByte >= QualityUtils.MAX_REASONABLE_Q_SCORE )
throw new RuntimeException(String.format("Illegal base quality score calculated: %s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d",
readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQPos, deltaQDinuc, newQualByte));
return newQualByte;
}
}*/
}

View File

@ -67,6 +67,14 @@ public class QualityUtils {
return b;
}
static public double phredScaleCorrectRate(double trueRate) {
return phredScaleErrorRate(1-trueRate);
}
static public double phredScaleErrorRate(double errorRate) {
return -10.0*Math.log10(errorRate);
}
/**
* Return a quality score, capped at 63.
*
@ -74,7 +82,11 @@ public class QualityUtils {
* @return the capped quality score
*/
static public byte boundQual(int qual) {
return (byte) Math.min(qual, MAX_QUAL_SCORE);
return boundQual(qual, MAX_QUAL_SCORE);
}
static public byte boundQual(int qual, byte maxQual) {
return (byte) Math.min(qual, maxQual);
}
/**