diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index f3d9e9efc..47a4d721b 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -33,7 +33,7 @@ public class CovariateCounterWalker extends LocusWalker { //@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.") //public int MAX_READ_GROUPS = 100; - + @Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)") public List platforms = Collections.singletonList("*"); //public List platforms = Collections.singletonList("ILLUMINA"); @@ -50,10 +50,14 @@ public class CovariateCounterWalker extends LocusWalker { long counted_bases = 0; // number of bases used to count covariates long skipped_sites = 0; // number of sites skipped because of a dbSNP entry + PrintStream recalTableOut = null; + public void initialize() { - //if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS ) - // Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." + - // "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter"); + try { + recalTableOut = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" ); + } catch ( FileNotFoundException e ) { + throw new RuntimeException("Couldn't open output file", e); + } for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { if( readGroup.getAttribute("PL") == null ) @@ -68,12 +72,31 @@ public class CovariateCounterWalker extends LocusWalker { out.printf("Created recalibration data collectors for %d read group(s)%n", data.size()); } + /** + * Get the particular RecalData datum associated with readGroup, at machine pos, with reported + * quality qual, and with the dinuc context of prevBase, base. If an example of such a + * base has been seen before, returns the associated RecalData. If not, it creates one, places it in the + * system so that subsequent requests will return that object, and returns it. + * + * @param readGroup + * @param pos + * @param qual + * @param prevBase + * @param base + * @return + */ private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) { byte[] cs = {(byte)prevBase, (byte)base}; String s = new String(cs); return data.get(readGroup).expandingGetRecalData(pos, qual, s, true); } + /** + * Get a list of all of the RecalData associated with readGroup + * + * @param readGroup + * @return + */ private List getRecalData(String readGroup) { return data.get(readGroup).getAll(); } @@ -110,6 +133,20 @@ public class CovariateCounterWalker extends LocusWalker { return 1; } + /** + * Updates the recalibration data for the base at offset in the read, associated with readGroup rg. + * Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read + * but by implied machine cycle associated with the offset. + * + * TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based + * TODO: and offset is 0-based. How very annoying. + * + * @param rg + * @param read + * @param offset + * @param ref + * @return + */ private int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) { int cycle = offset; byte[] bases = read.getReadBases(); @@ -126,13 +163,7 @@ public class CovariateCounterWalker extends LocusWalker { } int qual = quals[offset]; - if ( qual > 0 ) { // && qual <= QualityUtils.MAX_QUAL_SCORE ) { - // previous base is the next base in terms of machine chemistry if this is a negative strand - // if ( qual == 2 ) - //if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") ) - // System.out.printf("Adding neg?=%b b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c name=%s%n", - // read.getReadNegativeStrandFlag(), (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, - // ref, (char)BaseUtils.simpleComplement(ref), read.getReadName()); + if ( qual > 0 ) { RecalData datum = getRecalData(rg, cycle, qual, prevBase, base); if (datum != null) datum.inc(base,ref); return 1; @@ -153,6 +184,10 @@ public class CovariateCounterWalker extends LocusWalker { //out.printf("...done%n"); } + /** + * Prints some basic information about the CountCovariates run to the output stream out + * @param out + */ private void printInfo(PrintStream out) { out.printf("# date %s%n", new Date()); out.printf("# collapsed_pos %b%n", collapsePos); @@ -163,6 +198,7 @@ public class CovariateCounterWalker extends LocusWalker { out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites); } + @Deprecated private void writeLogisticRecalibrationTable() { PrintStream dinuc_out = null; try { @@ -189,36 +225,28 @@ public class CovariateCounterWalker extends LocusWalker { } } - private void writeRecalTable() { - PrintStream table_out = null; - try { - table_out = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv"); - printInfo(table_out); - table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp"); - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { - if ( datum.N > 0 ) - table_out.format("%s%n", datum.toCSVString(collapsePos)); - } - } - } catch (FileNotFoundException e ) { - System.err.println("FileNotFoundException: " + e.getMessage()); - } - finally { - if (table_out != null) table_out.close(); - } - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer a, Integer b) { - return 0; + /** + * Writes out the key recalibration data collected from the reads. Dumps this recalibration data + * as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file. + */ + private void writeRecalTable() { + printInfo(recalTableOut); + recalTableOut.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp"); + for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { + // TODO: should sort the data coming out of getRecalData here for easier processing + for ( RecalData datum: RecalData.sort(getRecalData(readGroup.getReadGroupId())) ) { + if ( datum.N > 0 ) + recalTableOut.format("%s%n", datum.toCSVString(collapsePos)); + } + } + recalTableOut.close(); } /** - * Check to see whether this read group should be processed. + * Check to see whether this read group should be processed. Returns true if the + * read group is in the list of platforms to process or the platform == *, indicating + * that all platforms should be processed. + * * @param readGroup * @return */ @@ -233,4 +261,24 @@ public class CovariateCounterWalker extends LocusWalker { return false; } + + /** + * No initialization routines + * + * @return + */ + public Integer reduceInit() { + return 0; + } + + /** + * Doesn't do anything + * + * @param a + * @param b + * @return + */ + public Integer reduce(Integer a, Integer b) { + return 0; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java index 7007b5c6d..d93574946 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java @@ -3,14 +3,25 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.BaseUtils; -public class RecalData { - long N; - long B; - int pos; - int qual; - String readGroup; - String dinuc; +import java.util.List; +import java.util.Collections; +public class RecalData implements Comparable { + // + // context of this data -- read group, position in read, reported quality, and dinuc + // + String readGroup; // the read group where the datum was observed + int pos; // the position of this recalibration datum + int qual; // the reported quality scoree of this datum + String dinuc; // the dinucleotide context of this datum + + // + // empirical quality score data + // + long N; // total number of observed bases + long B; // number of observed mismatches to the reference base + + public RecalData(int pos, int qual, String readGroup, String dinuc ) { this.pos = pos; this.qual = qual; @@ -18,61 +29,164 @@ public class RecalData { this.dinuc = dinuc; } - public void inc(long incN, long incB) { + public RecalData( RecalData copy ) { + this.pos = copy.pos; + this.qual = copy.qual; + this.readGroup = copy.readGroup; + this.dinuc = copy.dinuc; + this.N = copy.N; + this.B = copy.B; + } + + public int compareTo(RecalData to) { + int cmpReadGroup = readGroup.compareTo(to.readGroup); + if (cmpReadGroup != 0) return cmpReadGroup; + + int cmpPos = new Integer(pos).compareTo(to.pos); + if (cmpPos != 0) return cmpPos; + + int cmpQual = new Integer(qual).compareTo(to.qual); + if (cmpQual != 0) return cmpQual; + + return dinuc.compareTo(to.dinuc); + } + + /** + * Returns the expected number of mismatching bases for this datum, using the number of bases N and + * the reported qual score. Effectively qual implied error rate * number of bases. + * + * @return + */ + public double getExpectedErrors() { + return QualityUtils.qualToErrorProb((byte)qual) * N; + } + + // + // Increment routines + // + public RecalData inc(long incN, long incB) { N += incN; B += incB; + return this; } - //public int getDinucIndex() { - // return string2dinucIndex(this.dinuc); - //} + /** + * Add N and B from other to self + * + * @param other + * @return + */ + public RecalData inc(final RecalData other) { + return inc(other.N, other.B); + } - public void inc(char curBase, char ref) { - inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1); + /** + * Add all of the Ns and Bs for data to self + * + * @param data + * @return + */ + public RecalData inc(List data) { + for ( RecalData other : data ) { + this.inc(other); + } + return this; + } + + /** + * Count an empirical observation of our current base against the reference base. + * Increments N and, if curBase != ref, also increments B + * + * @param curBase + * @param ref + * @return + */ + public RecalData inc(char curBase, char ref) { + return inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1); //out.printf("%s %s\n", curBase, ref); } + + /** + * Get the integer dinuc index associated with this datum. Can return -1 if dinuc + * contains unexpected bases. Returns 0 if dinuc contains the * operator (not tracking dinucs) + * @return + */ + public int getDinucIndex() { + return dinucIndex(this.dinuc); + } + + /** + * Calculates the empirical quality score for this datum from B and N. Bounds the result + * by QualityUtils.MAX_REASONABLE_Q_SCORE. + * + * @return + */ + public double empiricalQualDouble(final boolean useRawQempirical) { + double doubleB = useRawQempirical ? B : B + 1; + double doubleN = useRawQempirical ? N : N + 1; + double empiricalQual = -10 * Math.log10(doubleB / doubleN); + if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; + return empiricalQual; + } + public double empiricalQualDouble() { return empiricalQualDouble(true); } + + + /** + * As as empiricalQualDouble, but rounded and encoded as a byte for placement in a read quality array of bytes + * @return + */ + public byte empiricalQualByte(final boolean useRawQempirical) { + double doubleB = useRawQempirical ? B : B + 1; + double doubleN = useRawQempirical ? N : N + 1; + return QualityUtils.probToQual(1.0 - doubleB / doubleN); + } + public byte empiricalQualByte() { return empiricalQualByte(true); } + + public static String headerString() { return ("pos, rg, dinuc, qual, emp_qual, qual_diff, n, b"); } - public double empiricalQualDouble() { - double empiricalQual = -10 * Math.log10((double)B / N); - if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; - return empiricalQual; - } - - public byte empiricalQualByte() { - return QualityUtils.probToQual(1.0 - (double)B / N); - } - public String toString() { - double empiricalQual = empiricalQualDouble(); - return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); + return String.format("[rg=%s, pos=%3d, Qrep=%3d, dinuc=%s, Qemp=%2.2f / %2.2f, B=%d, N=%d]", readGroup, pos, qual, dinuc, empiricalQualDouble(), empiricalQualDouble(false), B, N); + //return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); } + // + // To and from CSV strings + // public String toCSVString(boolean collapsedPos) { - return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B, empiricalQualByte()); + // rg,pos,Qrep,dn,nBases,nMismatches,Qemp + return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, collapsedPos ? "*" : pos, qual, dinuc, N, B, empiricalQualByte()); } + /** + * Parses the string s and returns the encoded RecalData in it. Assumes data has been emitted by + * toCSVString() routine. Order is readGroup, dinuc, qual, pos, N, B, Qemp as byte + * @param s + * @return + */ public static RecalData fromCSVString(String s) { String[] vals = s.split(","); String rg = vals[0]; - String dinuc = vals[1]; + int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]); int qual = Integer.parseInt(vals[2]); - - int pos = vals[3].equals("*") ? 0 : Integer.parseInt(vals[3]); + String dinuc = vals[3]; int N = Integer.parseInt(vals[4]); int B = Integer.parseInt(vals[5]); RecalData datum = new RecalData(pos, qual, rg, dinuc); datum.B = B; datum.N = N; - - //if ( datum.N > 0 ) System.out.printf("Parsing line [%s] => [%s]%n", s, datum); - return datum; } + + public static List sort(List l) { + Collections.sort(l); + return l; + } + public static int bases2dinucIndex(char prevBase, char base) { int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase); int bI = BaseUtils.simpleBaseToBaseIndex(base); @@ -93,25 +207,17 @@ public class RecalData { return bases2dinucIndex((char)prevBase, (char)base); } -// -// private static int nuc2num[]; -// private static char num2nuc[]; -// -// static { -// nuc2num = new int[128]; -// nuc2num['A'] = 0; -// nuc2num['C'] = 1; -// nuc2num['G'] = 2; -// nuc2num['T'] = 3; -// nuc2num['a'] = 0; -// nuc2num['c'] = 1; -// nuc2num['g'] = 2; -// nuc2num['t'] = 3; -// -// num2nuc = new char[4]; -// num2nuc[0] = 'A'; -// num2nuc[1] = 'C'; -// num2nuc[2] = 'G'; -// num2nuc[3] = 'T'; -// } + public static double combinedQreported(List l) { + double sumExpectedErrors = 0; + long nBases = 0; + for ( RecalData datum : l ) { + sumExpectedErrors += datum.getExpectedErrors(); + nBases += datum.N; + } + + double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases); + System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n", + sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q); + return q; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 54cc64307..4950a91eb 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.ExpandingArrayList; import java.util.*; @@ -64,15 +63,19 @@ public class RecalDataManager { throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup)); } - if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null ) - throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum)); + RecalData prev = getRecalData(datum.pos, datum.qual, datum.dinuc); + if ( prev != null ) { + if ( trackDinuc && trackPos ) + throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum)); + prev.inc(datum); + } else { + int posIndex = getPosIndex(datum.pos); + int internalDinucIndex = getDinucIndex(datum.dinuc); + if ( internalDinucIndex == -1 ) return; - int posIndex = getPosIndex(datum.pos); - int internalDinucIndex = getDinucIndex(datum.dinuc); - if ( internalDinucIndex == -1 ) return; - - set(posIndex, datum.qual, internalDinucIndex, datum); - flattenData.add(datum); + set(posIndex, datum.qual, internalDinucIndex, datum); + flattenData.add(datum); + } } public RecalData getRecalData(int pos, int qual, String dinuc) { @@ -157,47 +160,41 @@ public class RecalDataManager { return l; } + */ - public List getDataByPos() { - List l = new ArrayList(data.length); - for ( int pos = 0; pos < maxReadLen; pos++ ) { - for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) { - RecalData datum = new RecalData(pos, qual, readGroup, "**"); - for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) { - RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex)); - if ( datum2 != null ) - datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B); - } - if ( datum.N > 0 ) l.add(datum); + private RecalData getMatchingDatum(List l, RecalData datum, + boolean combinePos, boolean combineQual, boolean combineDinuc) { + for ( RecalData find : l ) { + if ( (combineQual || find.qual == datum.qual) && + (combinePos || find.pos == datum.pos ) && + (combineDinuc || find.dinuc.equals(datum.dinuc)) ) { + //System.out.printf("Found %s for %s%n", find, datum); + return find; } } - - System.out.printf("getDataByPos => %d%n", l.size()); - return l; + RecalData d = new RecalData(combinePos ? 0 : datum.pos, combineQual ? 0 : datum.qual, datum.readGroup, combineDinuc ? "**" : datum.dinuc ); + //System.out.printf("Making new match %s%n", d); + l.add(d); + return d; } - public List getDataByDinuc() { - List l = new ArrayList(nDinucs); + public List combineDinucs() { + return combine(false, false, true); + } - for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) { - for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) { - RecalData datum = new RecalData(-1, qual, readGroup, RecalData.dinucIndex2bases(dinucIndex)); - System.out.printf("Aggregating [%s]:%n", datum); - for ( int pos = 0; pos < data.length; pos++ ) { - RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex)); - if ( datum2 != null ) { - System.out.printf(" + [%s]:%n", datum2); - datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B); - } - } - if ( datum.N > 0 ) l.add(datum); - System.out.printf(" %s [%s]:%n", datum.N > 0 ? "=>" : "<>", datum); - } + public List combineCycles() { + return combine(true, false, false); + } + + public List combine(boolean ignorePos, boolean ignoreQual, boolean ignoreDinuc) { + List l = new ArrayList(); + for ( RecalData datum : flattenData ) { + RecalData reduced = getMatchingDatum(l, datum, ignorePos, ignoreQual, ignoreDinuc); + //System.out.printf("Combining %s with %s%n", datum, reduced); + reduced.inc(datum); } - - System.out.printf("getDataByDinuc => %d%n", l.size()); return l; - }*/ + } public List getAll() { return flattenData; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index b45d6d0fa..8b50ae1d6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.*; @@ -24,6 +49,9 @@ public class TableRecalibrationWalker extends ReadWalker [prevBase x base -> [cycle, qual, new qual]] HashMap cache = new HashMap(); - //@Argument(shortName="serial", doc="", required=false) - //boolean serialRecalibration = false; + public enum RecalibrationMode { + COMBINATORIAL, + JAFFE, + BY_POS_ONLY, + BY_DINUC_ONLY, + SEQUENTIAL, + ERROR + } + + @Argument(shortName="mode", doc="", required=false) + public String modeString = RecalibrationMode.SEQUENTIAL.toString(); + public RecalibrationMode mode = RecalibrationMode.ERROR; private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)"); @@ -40,6 +78,23 @@ public class TableRecalibrationWalker extends ReadWalker data = new ArrayList(); @@ -48,13 +103,12 @@ public class TableRecalibrationWalker extends ReadWalker lines = new xReadLines(new File(paramsFile)).readLines(); for ( String line : lines ) { - //System.out.printf("Reading line %s%n", line); + lineNumber++; if ( HEADER_PATTERN.matcher(line).matches() ) continue; if ( COMMENT_PATTERN.matcher(line).matches() ) { collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos); collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc); - //System.out.printf("Collapsed %b %b%n", collapsedPos, collapsedDinuc); } else { data.add(RecalData.fromCSVString(line)); @@ -63,13 +117,14 @@ public class TableRecalibrationWalker extends ReadWalker managers = new HashMap(); for ( String readGroup : readGroups ) { + // create a manager for each read group RecalDataManager manager = new RecalDataManager(readGroup, ! collapsedPos, ! collapsedDinuc); - //RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc); managers.put(readGroup, manager); } - // fill in the manager structure + // add the data to their appropriate readGroup managers for ( RecalData datum : data ) { managers.get(datum.readGroup).addDatum(datum); } @@ -111,15 +191,36 @@ public class TableRecalibrationWalker extends ReadWalker new qual + * + * @param manager + * @param dinucs + * @param maxPos + * @param maxQReported + * @return + */ + private RecalMapping initializeMapper( RecalDataManager manager, final boolean rawQempirical, + Set dinucs, int maxPos, int maxQReported ) { + switch ( mode ) { + case COMBINATORIAL: + case JAFFE: + return new CombinatorialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported); + case SEQUENTIAL: + return new SerialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported); + default: + throw new RuntimeException("Unimplemented recalibration mode: " + mode); + } + } + public SAMRecord map(char[] ref, SAMRecord read) { byte[] bases = read.getReadBases(); byte[] quals = read.getBaseQualities(); @@ -132,16 +233,21 @@ public class TableRecalibrationWalker extends ReadWalker %d at %s%n", - // cycle, qual, read.getReadNegativeStrandFlag(), newQual, read.getReadName()); recalQuals[cycle] = newQual; //System.out.printf("Mapping %d => %d%n", qual, newQual); } @@ -160,6 +263,11 @@ public class TableRecalibrationWalker extends ReadWalker cache; RecalDataManager manager; - public CombinatorialRecalMapping(RecalDataManager manager, Set dinucs, int maxPos, int maxQReported ) { + public CombinatorialRecalMapping(RecalDataManager manager, final boolean useRawQempirical, + Set dinucs, int maxPos, int maxQReported ) { this.manager = manager; // initialize the data structure @@ -211,11 +319,12 @@ class CombinatorialRecalMapping implements RecalMapping { for ( RecalData datum : manager.getAll() ) { //System.out.printf("Adding datum %s%n", datum); byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc)); - if ( table[datum.pos][datum.qual] != 0 ) + int pos = manager.canonicalPos(datum.pos); + if ( table[pos][datum.qual] != 0 ) throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); //table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte()); - table[datum.pos][datum.qual] = datum.empiricalQualByte(); - // System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte()); + table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical); + //System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical)); } } @@ -241,102 +350,154 @@ class CombinatorialRecalMapping implements RecalMapping { } } -/*class CombinatorialRecalMapping implements RecalMapping { - HashMap cache = new HashMap(); + +/** + * Implements a serial recalibration of the reads using the combinational table. First, + * we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + */ +class SerialRecalMapping implements RecalMapping { + private double globalDeltaQ = 0.0; + private double[][] deltaQPosMap, deltaQDinucMap; + double [] deltaQualMap; + RecalData [][] qPosSupports, qDinucSupports; + + CombinatorialRecalMapping combiMap; RecalDataManager manager; - public CombinatorialRecalMapping(RecalDataManager manager, Set dinucs, int maxPos, int maxQReported ) { + String dinucToLookAt = null; // "CC"; + int posToLookAt = 0; + int qualToLookAt = 25; + + public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical, + Set dinucs, int maxPos, int maxQReported ) { + //combiMap = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported ); this.manager = manager; // initialize the data structure - for ( String dinuc : dinucs ) { - byte[][] table = new byte[maxPos+1][maxQReported+1]; - cache.put(dinuc, table); + { + RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll()); + double aggregrateQreported = RecalData.combinedQreported(manager.getAll()); + globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported; + System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ); } for ( RecalData datum : manager.getAll() ) { - //System.out.printf("Adding datum %s%n", datum); - byte [][] table = cache.get(datum.dinuc); - if ( table[datum.pos][datum.qual] != 0 ) - throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); - //table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte()); - table[datum.pos][datum.qual] = datum.empiricalQualByte(); - // System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte()); + if ( printStateP(datum) ) + System.out.printf("PrintValue: %s%n", datum); } + + // Jaffe-style + deltaQualMap = new double[maxQReported+1]; + for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) { + deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ; + System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]); + } + + // calculate the delta Q pos array + deltaQPosMap = new double[maxPos+1][maxQReported+1]; + qPosSupports = new RecalData[maxPos+1][maxQReported+1]; + for ( RecalData datumAtPosQual : manager.combineDinucs() ) { + double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual]; + updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset); + } + + // calculate the delta Q dinuc array + deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1]; + qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1]; + for ( RecalData datumAtDinucQual : manager.combineCycles() ) { + double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual]; + updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset); + } + + validateTables(maxPos, maxQReported, dinucs.size(), deltaQPosMap, deltaQDinucMap, useRawQempirical, manager.getAll()); } - public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) { - //String dinuc = String.format("%c%c", (char)prevBase, (char)base); - //if ( qual == 2 ) - // System.out.printf("Qual = 2%n"); - byte[] bp = {prevBase, base}; - String dinuc = manager.canonicalDinuc(new String(bp)); - int pos = manager.canonicalPos(cycle); - byte[][] dataTable = cache.get(dinuc); - - if ( dataTable == null && prevBase != 'N' && base != 'N' ) - throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc)); - - byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual; - - //if ( result == 2 ) - // System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n", - // readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result); - - return result; - } -}*/ -/* -class SerialRecalMapping implements RecalMapping { - // mapping from dinuc x Q => new Q - HashMap mappingByDinuc; - - // mapping from pos x Q => new Q - byte[][] mappingByPos; - - public SerialRecalMapping(RecalDataManager manager, Set dinucs, int maxPos, int maxQReported ) { - mappingByDinuc = new HashMap(); - for ( String dinuc : dinucs ) { - byte[] table = new byte[maxQReported+1]; - mappingByDinuc.put(dinuc, table); - } - for ( RecalData datum : manager.getDataByDinuc() ) { - //System.out.printf("Adding datum %s%n", datum); - if ( mappingByDinuc.get(datum.dinuc).length <= datum.qual ) { - throw new RuntimeException(String.format("Unexpectedly massive Q score of %d found, calculated max was %d%n", maxQReported, datum.qual)); + private void validateTables(int maxPos, int maxQReported, int nDinucs, + double[][] deltaQPosMap, double[][] deltaQDinucMap, + final boolean useRawQempirical, List data ) { + for ( int i = 0; i < maxPos; i++ ) { + for ( int j = 0; j < maxQReported; j++ ) { + if ( printStateP(i, null, j) ) + System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n", + i, j, deltaQPosMap[i][j], qPosSupports[i][j]); } - mappingByDinuc.get(datum.dinuc)[datum.qual] = datum.empiricalQualByte(); } - // initialize the mapping by position - mappingByPos = new byte[maxPos+1][maxQReported+1]; - for ( RecalData datum : manager.getDataByPos() ) { - //System.out.printf("Adding datum %s%n", datum); - mappingByPos[datum.pos][datum.qual] = datum.empiricalQualByte(); + for ( int i = 0; i < nDinucs; i++ ) { + for ( int j = 0; j < maxQReported; j++ ) { + String dinuc = RecalData.dinucIndex2bases(i); + if ( printStateP(0, dinuc, j ) ) + System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n", + dinuc, j, deltaQDinucMap[i][j], qDinucSupports[i][j]); + } } + + for ( RecalData datum : RecalData.sort(data) ) { + byte newQual = getNewQual(datum.readGroup, (byte)datum.dinuc.charAt(0), (byte)datum.dinuc.charAt(1), datum.pos, (byte)datum.qual); + if ( printStateP( datum ) ) { + System.out.printf("Serial mapping %s => %d => %d vs. %d => delta = %d%n", + datum, newQual, datum.empiricalQualByte(useRawQempirical), datum.empiricalQualByte(), newQual - datum.empiricalQualByte(useRawQempirical)); + } + } + } + + private void updateCache( RecalData[][] supports, RecalData datum, final boolean useRawQempirical, + double[][] table, int i, int j, double meanQ ) { + if ( table[i][j] != 0 ) + throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); + double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ; + table[i][j] = deltaQ; + supports[i][j] = datum; + } + + private boolean printStateP( int cycle, String dinuc, int qual ) { + return posToLookAt != -1 && + ( cycle == 0 || posToLookAt == 0 || cycle == posToLookAt ) && + ( dinucToLookAt == null || dinuc == null || manager.getDinucIndex(dinuc) == -1 || dinucToLookAt.equals(dinuc)) && + ( qualToLookAt == 0 || qual == qualToLookAt ); + } + + private boolean printStateP( RecalData datum ) { + return printStateP(datum.pos, datum.dinuc, datum.qual); } public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) { - //System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d%n", readGroup, prevBase, base, cycle, qual); - //String dinuc = String.format("%c%c", (char)prevBase, (char)base); - byte[] bp = {prevBase, base}; - String dinuc = new String(bp); + int pos = manager.canonicalPos(cycle); + int index = this.manager.getDinucIndex(prevBase, base); - byte newQualFromDinuc = 0; - byte newQualFromPos = cycle > 0 && cycle < mappingByPos.length ? mappingByPos[cycle][qual] : qual; - byte newQual = newQualFromPos; - if ( prevBase != 'N' && base != 'N' ) { - // if the qual got mapped too high, assume it's the best we've seen for recalibration purposes - int newQualIndex = newQual < mappingByDinuc.get(dinuc).length ? newQual : mappingByDinuc.get(dinuc).length - 1; - newQualFromDinuc = mappingByDinuc.get(dinuc)[newQualIndex]; - newQual = newQualFromDinuc; - } + double deltaQual = deltaQualMap[qual]; + double deltaQPos = pos >= deltaQPosMap.length ? 0.0 : deltaQPosMap[pos][qual]; // == length indices no data for last base + double deltaQDinuc = index == -1 ? 0.0 : deltaQDinucMap[index][qual]; // -1 indices N in prev or current base + double newQual = qual + globalDeltaQ + deltaQual + deltaQPos + deltaQDinuc; + byte newQualByte = QualityUtils.boundQual((int)Math.round(newQual), QualityUtils.MAX_REASONABLE_Q_SCORE); - //System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d => %d => %d => %d%n", - // readGroup, prevBase, base, cycle, qual, newQualFromPos, newQualFromDinuc, newQual); + //if ( printStateP(pos, RecalData.dinucIndex2bases(index), qual) ) + // System.out.printf("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d%n", + // readGroup, prevBase, base, cycle, qual, + // qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc, + // newQualByte); - return newQual; + //byte combiQualByte = combiMap.getNewQual(readGroup, prevBase, base, cycle, qual); + //System.out.printf("%s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d vs %d (%d)%n", + // readGroup, prevBase, base, cycle, qual, + // qual, globalDeltaQ, deltaQPos, deltaQDinuc, + // newQualByte, combiQualByte, newQualByte - combiQualByte); + + if ( newQualByte <= 0 && newQualByte >= QualityUtils.MAX_REASONABLE_Q_SCORE ) + throw new RuntimeException(String.format("Illegal base quality score calculated: %s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d", + readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQPos, deltaQDinuc, newQualByte)); + + return newQualByte; } -}*/ +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/java/src/org/broadinstitute/sting/utils/QualityUtils.java index ce684902c..3111612b4 100755 --- a/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -67,6 +67,14 @@ public class QualityUtils { return b; } + static public double phredScaleCorrectRate(double trueRate) { + return phredScaleErrorRate(1-trueRate); + } + + static public double phredScaleErrorRate(double errorRate) { + return -10.0*Math.log10(errorRate); + } + /** * Return a quality score, capped at 63. * @@ -74,7 +82,11 @@ public class QualityUtils { * @return the capped quality score */ static public byte boundQual(int qual) { - return (byte) Math.min(qual, MAX_QUAL_SCORE); + return boundQual(qual, MAX_QUAL_SCORE); + } + + static public byte boundQual(int qual, byte maxQual) { + return (byte) Math.min(qual, maxQual); } /**