Updated and near final version of tabular recalibration system. Uses 'yates' correction for low-occupancy quality bins. Faster and more robust handling of input and output
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1082 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
caf5aef0f8
commit
0a50f2e160
|
|
@ -33,7 +33,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
//@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
|
||||
//public int MAX_READ_GROUPS = 100;
|
||||
|
||||
|
||||
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
|
||||
public List<String> platforms = Collections.singletonList("*");
|
||||
//public List<String> platforms = Collections.singletonList("ILLUMINA");
|
||||
|
|
@ -50,10 +50,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
long counted_bases = 0; // number of bases used to count covariates
|
||||
long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
||||
|
||||
PrintStream recalTableOut = null;
|
||||
|
||||
public void initialize() {
|
||||
//if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS )
|
||||
// Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." +
|
||||
// "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter");
|
||||
try {
|
||||
recalTableOut = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Couldn't open output file", e);
|
||||
}
|
||||
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
if( readGroup.getAttribute("PL") == null )
|
||||
|
|
@ -68,12 +72,31 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
out.printf("Created recalibration data collectors for %d read group(s)%n", data.size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the particular RecalData datum associated with readGroup, at machine pos, with reported
|
||||
* quality qual, and with the dinuc context of prevBase, base. If an example of such a
|
||||
* base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
|
||||
* system so that subsequent requests will return that object, and returns it.
|
||||
*
|
||||
* @param readGroup
|
||||
* @param pos
|
||||
* @param qual
|
||||
* @param prevBase
|
||||
* @param base
|
||||
* @return
|
||||
*/
|
||||
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||
byte[] cs = {(byte)prevBase, (byte)base};
|
||||
String s = new String(cs);
|
||||
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a list of all of the RecalData associated with readGroup
|
||||
*
|
||||
* @param readGroup
|
||||
* @return
|
||||
*/
|
||||
private List<RecalData> getRecalData(String readGroup) {
|
||||
return data.get(readGroup).getAll();
|
||||
}
|
||||
|
|
@ -110,6 +133,20 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
|
||||
* Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
|
||||
* but by implied machine cycle associated with the offset.
|
||||
*
|
||||
* TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based
|
||||
* TODO: and offset is 0-based. How very annoying.
|
||||
*
|
||||
* @param rg
|
||||
* @param read
|
||||
* @param offset
|
||||
* @param ref
|
||||
* @return
|
||||
*/
|
||||
private int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
|
||||
int cycle = offset;
|
||||
byte[] bases = read.getReadBases();
|
||||
|
|
@ -126,13 +163,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
int qual = quals[offset];
|
||||
if ( qual > 0 ) { // && qual <= QualityUtils.MAX_QUAL_SCORE ) {
|
||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||
// if ( qual == 2 )
|
||||
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||
// System.out.printf("Adding neg?=%b b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c name=%s%n",
|
||||
// read.getReadNegativeStrandFlag(), (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base,
|
||||
// ref, (char)BaseUtils.simpleComplement(ref), read.getReadName());
|
||||
if ( qual > 0 ) {
|
||||
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
||||
if (datum != null) datum.inc(base,ref);
|
||||
return 1;
|
||||
|
|
@ -153,6 +184,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
//out.printf("...done%n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints some basic information about the CountCovariates run to the output stream out
|
||||
* @param out
|
||||
*/
|
||||
private void printInfo(PrintStream out) {
|
||||
out.printf("# date %s%n", new Date());
|
||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||
|
|
@ -163,6 +198,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
private void writeLogisticRecalibrationTable() {
|
||||
PrintStream dinuc_out = null;
|
||||
try {
|
||||
|
|
@ -189,36 +225,28 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private void writeRecalTable() {
|
||||
PrintStream table_out = null;
|
||||
try {
|
||||
table_out = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv");
|
||||
printInfo(table_out);
|
||||
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||
if ( datum.N > 0 )
|
||||
table_out.format("%s%n", datum.toCSVString(collapsePos));
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e ) {
|
||||
System.err.println("FileNotFoundException: " + e.getMessage());
|
||||
}
|
||||
finally {
|
||||
if (table_out != null) table_out.close();
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer a, Integer b) {
|
||||
return 0;
|
||||
/**
|
||||
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
|
||||
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
|
||||
*/
|
||||
private void writeRecalTable() {
|
||||
printInfo(recalTableOut);
|
||||
recalTableOut.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
// TODO: should sort the data coming out of getRecalData here for easier processing
|
||||
for ( RecalData datum: RecalData.sort(getRecalData(readGroup.getReadGroupId())) ) {
|
||||
if ( datum.N > 0 )
|
||||
recalTableOut.format("%s%n", datum.toCSVString(collapsePos));
|
||||
}
|
||||
}
|
||||
recalTableOut.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Check to see whether this read group should be processed.
|
||||
* Check to see whether this read group should be processed. Returns true if the
|
||||
* read group is in the list of platforms to process or the platform == *, indicating
|
||||
* that all platforms should be processed.
|
||||
*
|
||||
* @param readGroup
|
||||
* @return
|
||||
*/
|
||||
|
|
@ -233,4 +261,24 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* No initialization routines
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Doesn't do anything
|
||||
*
|
||||
* @param a
|
||||
* @param b
|
||||
* @return
|
||||
*/
|
||||
public Integer reduce(Integer a, Integer b) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,14 +3,25 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
public class RecalData {
|
||||
long N;
|
||||
long B;
|
||||
int pos;
|
||||
int qual;
|
||||
String readGroup;
|
||||
String dinuc;
|
||||
import java.util.List;
|
||||
import java.util.Collections;
|
||||
|
||||
public class RecalData implements Comparable<RecalData> {
|
||||
//
|
||||
// context of this data -- read group, position in read, reported quality, and dinuc
|
||||
//
|
||||
String readGroup; // the read group where the datum was observed
|
||||
int pos; // the position of this recalibration datum
|
||||
int qual; // the reported quality scoree of this datum
|
||||
String dinuc; // the dinucleotide context of this datum
|
||||
|
||||
//
|
||||
// empirical quality score data
|
||||
//
|
||||
long N; // total number of observed bases
|
||||
long B; // number of observed mismatches to the reference base
|
||||
|
||||
|
||||
public RecalData(int pos, int qual, String readGroup, String dinuc ) {
|
||||
this.pos = pos;
|
||||
this.qual = qual;
|
||||
|
|
@ -18,61 +29,164 @@ public class RecalData {
|
|||
this.dinuc = dinuc;
|
||||
}
|
||||
|
||||
public void inc(long incN, long incB) {
|
||||
public RecalData( RecalData copy ) {
|
||||
this.pos = copy.pos;
|
||||
this.qual = copy.qual;
|
||||
this.readGroup = copy.readGroup;
|
||||
this.dinuc = copy.dinuc;
|
||||
this.N = copy.N;
|
||||
this.B = copy.B;
|
||||
}
|
||||
|
||||
public int compareTo(RecalData to) {
|
||||
int cmpReadGroup = readGroup.compareTo(to.readGroup);
|
||||
if (cmpReadGroup != 0) return cmpReadGroup;
|
||||
|
||||
int cmpPos = new Integer(pos).compareTo(to.pos);
|
||||
if (cmpPos != 0) return cmpPos;
|
||||
|
||||
int cmpQual = new Integer(qual).compareTo(to.qual);
|
||||
if (cmpQual != 0) return cmpQual;
|
||||
|
||||
return dinuc.compareTo(to.dinuc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the expected number of mismatching bases for this datum, using the number of bases N and
|
||||
* the reported qual score. Effectively qual implied error rate * number of bases.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getExpectedErrors() {
|
||||
return QualityUtils.qualToErrorProb((byte)qual) * N;
|
||||
}
|
||||
|
||||
//
|
||||
// Increment routines
|
||||
//
|
||||
public RecalData inc(long incN, long incB) {
|
||||
N += incN;
|
||||
B += incB;
|
||||
return this;
|
||||
}
|
||||
|
||||
//public int getDinucIndex() {
|
||||
// return string2dinucIndex(this.dinuc);
|
||||
//}
|
||||
/**
|
||||
* Add N and B from other to self
|
||||
*
|
||||
* @param other
|
||||
* @return
|
||||
*/
|
||||
public RecalData inc(final RecalData other) {
|
||||
return inc(other.N, other.B);
|
||||
}
|
||||
|
||||
public void inc(char curBase, char ref) {
|
||||
inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
|
||||
/**
|
||||
* Add all of the Ns and Bs for data to self
|
||||
*
|
||||
* @param data
|
||||
* @return
|
||||
*/
|
||||
public RecalData inc(List<RecalData> data) {
|
||||
for ( RecalData other : data ) {
|
||||
this.inc(other);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count an empirical observation of our current base against the reference base.
|
||||
* Increments N and, if curBase != ref, also increments B
|
||||
*
|
||||
* @param curBase
|
||||
* @param ref
|
||||
* @return
|
||||
*/
|
||||
public RecalData inc(char curBase, char ref) {
|
||||
return inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
|
||||
//out.printf("%s %s\n", curBase, ref);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the integer dinuc index associated with this datum. Can return -1 if dinuc
|
||||
* contains unexpected bases. Returns 0 if dinuc contains the * operator (not tracking dinucs)
|
||||
* @return
|
||||
*/
|
||||
public int getDinucIndex() {
|
||||
return dinucIndex(this.dinuc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the empirical quality score for this datum from B and N. Bounds the result
|
||||
* by QualityUtils.MAX_REASONABLE_Q_SCORE.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double empiricalQualDouble(final boolean useRawQempirical) {
|
||||
double doubleB = useRawQempirical ? B : B + 1;
|
||||
double doubleN = useRawQempirical ? N : N + 1;
|
||||
double empiricalQual = -10 * Math.log10(doubleB / doubleN);
|
||||
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
|
||||
return empiricalQual;
|
||||
}
|
||||
public double empiricalQualDouble() { return empiricalQualDouble(true); }
|
||||
|
||||
|
||||
/**
|
||||
* As as empiricalQualDouble, but rounded and encoded as a byte for placement in a read quality array of bytes
|
||||
* @return
|
||||
*/
|
||||
public byte empiricalQualByte(final boolean useRawQempirical) {
|
||||
double doubleB = useRawQempirical ? B : B + 1;
|
||||
double doubleN = useRawQempirical ? N : N + 1;
|
||||
return QualityUtils.probToQual(1.0 - doubleB / doubleN);
|
||||
}
|
||||
public byte empiricalQualByte() { return empiricalQualByte(true); }
|
||||
|
||||
|
||||
public static String headerString() {
|
||||
return ("pos, rg, dinuc, qual, emp_qual, qual_diff, n, b");
|
||||
}
|
||||
|
||||
public double empiricalQualDouble() {
|
||||
double empiricalQual = -10 * Math.log10((double)B / N);
|
||||
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
|
||||
return empiricalQual;
|
||||
}
|
||||
|
||||
public byte empiricalQualByte() {
|
||||
return QualityUtils.probToQual(1.0 - (double)B / N);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
double empiricalQual = empiricalQualDouble();
|
||||
return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
||||
return String.format("[rg=%s, pos=%3d, Qrep=%3d, dinuc=%s, Qemp=%2.2f / %2.2f, B=%d, N=%d]", readGroup, pos, qual, dinuc, empiricalQualDouble(), empiricalQualDouble(false), B, N);
|
||||
//return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
||||
}
|
||||
|
||||
//
|
||||
// To and from CSV strings
|
||||
//
|
||||
public String toCSVString(boolean collapsedPos) {
|
||||
return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B, empiricalQualByte());
|
||||
// rg,pos,Qrep,dn,nBases,nMismatches,Qemp
|
||||
return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, collapsedPos ? "*" : pos, qual, dinuc, N, B, empiricalQualByte());
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the string s and returns the encoded RecalData in it. Assumes data has been emitted by
|
||||
* toCSVString() routine. Order is readGroup, dinuc, qual, pos, N, B, Qemp as byte
|
||||
* @param s
|
||||
* @return
|
||||
*/
|
||||
public static RecalData fromCSVString(String s) {
|
||||
String[] vals = s.split(",");
|
||||
String rg = vals[0];
|
||||
String dinuc = vals[1];
|
||||
int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]);
|
||||
int qual = Integer.parseInt(vals[2]);
|
||||
|
||||
int pos = vals[3].equals("*") ? 0 : Integer.parseInt(vals[3]);
|
||||
String dinuc = vals[3];
|
||||
int N = Integer.parseInt(vals[4]);
|
||||
int B = Integer.parseInt(vals[5]);
|
||||
RecalData datum = new RecalData(pos, qual, rg, dinuc);
|
||||
datum.B = B;
|
||||
datum.N = N;
|
||||
|
||||
//if ( datum.N > 0 ) System.out.printf("Parsing line [%s] => [%s]%n", s, datum);
|
||||
|
||||
return datum;
|
||||
}
|
||||
|
||||
|
||||
public static List<RecalData> sort(List<RecalData> l) {
|
||||
Collections.sort(l);
|
||||
return l;
|
||||
}
|
||||
|
||||
public static int bases2dinucIndex(char prevBase, char base) {
|
||||
int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase);
|
||||
int bI = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
|
|
@ -93,25 +207,17 @@ public class RecalData {
|
|||
return bases2dinucIndex((char)prevBase, (char)base);
|
||||
}
|
||||
|
||||
//
|
||||
// private static int nuc2num[];
|
||||
// private static char num2nuc[];
|
||||
//
|
||||
// static {
|
||||
// nuc2num = new int[128];
|
||||
// nuc2num['A'] = 0;
|
||||
// nuc2num['C'] = 1;
|
||||
// nuc2num['G'] = 2;
|
||||
// nuc2num['T'] = 3;
|
||||
// nuc2num['a'] = 0;
|
||||
// nuc2num['c'] = 1;
|
||||
// nuc2num['g'] = 2;
|
||||
// nuc2num['t'] = 3;
|
||||
//
|
||||
// num2nuc = new char[4];
|
||||
// num2nuc[0] = 'A';
|
||||
// num2nuc[1] = 'C';
|
||||
// num2nuc[2] = 'G';
|
||||
// num2nuc[3] = 'T';
|
||||
// }
|
||||
public static double combinedQreported(List<RecalData> l) {
|
||||
double sumExpectedErrors = 0;
|
||||
long nBases = 0;
|
||||
for ( RecalData datum : l ) {
|
||||
sumExpectedErrors += datum.getExpectedErrors();
|
||||
nBases += datum.N;
|
||||
}
|
||||
|
||||
double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases);
|
||||
System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
|
||||
sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
|
||||
return q;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,5 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -64,15 +63,19 @@ public class RecalDataManager {
|
|||
throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup));
|
||||
}
|
||||
|
||||
if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
|
||||
RecalData prev = getRecalData(datum.pos, datum.qual, datum.dinuc);
|
||||
if ( prev != null ) {
|
||||
if ( trackDinuc && trackPos )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
|
||||
prev.inc(datum);
|
||||
} else {
|
||||
int posIndex = getPosIndex(datum.pos);
|
||||
int internalDinucIndex = getDinucIndex(datum.dinuc);
|
||||
if ( internalDinucIndex == -1 ) return;
|
||||
|
||||
int posIndex = getPosIndex(datum.pos);
|
||||
int internalDinucIndex = getDinucIndex(datum.dinuc);
|
||||
if ( internalDinucIndex == -1 ) return;
|
||||
|
||||
set(posIndex, datum.qual, internalDinucIndex, datum);
|
||||
flattenData.add(datum);
|
||||
set(posIndex, datum.qual, internalDinucIndex, datum);
|
||||
flattenData.add(datum);
|
||||
}
|
||||
}
|
||||
|
||||
public RecalData getRecalData(int pos, int qual, String dinuc) {
|
||||
|
|
@ -157,47 +160,41 @@ public class RecalDataManager {
|
|||
|
||||
return l;
|
||||
}
|
||||
*/
|
||||
|
||||
public List<RecalData> getDataByPos() {
|
||||
List<RecalData> l = new ArrayList<RecalData>(data.length);
|
||||
for ( int pos = 0; pos < maxReadLen; pos++ ) {
|
||||
for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) {
|
||||
RecalData datum = new RecalData(pos, qual, readGroup, "**");
|
||||
for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) {
|
||||
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
|
||||
if ( datum2 != null )
|
||||
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
|
||||
}
|
||||
if ( datum.N > 0 ) l.add(datum);
|
||||
private RecalData getMatchingDatum(List<RecalData> l, RecalData datum,
|
||||
boolean combinePos, boolean combineQual, boolean combineDinuc) {
|
||||
for ( RecalData find : l ) {
|
||||
if ( (combineQual || find.qual == datum.qual) &&
|
||||
(combinePos || find.pos == datum.pos ) &&
|
||||
(combineDinuc || find.dinuc.equals(datum.dinuc)) ) {
|
||||
//System.out.printf("Found %s for %s%n", find, datum);
|
||||
return find;
|
||||
}
|
||||
}
|
||||
|
||||
System.out.printf("getDataByPos => %d%n", l.size());
|
||||
return l;
|
||||
RecalData d = new RecalData(combinePos ? 0 : datum.pos, combineQual ? 0 : datum.qual, datum.readGroup, combineDinuc ? "**" : datum.dinuc );
|
||||
//System.out.printf("Making new match %s%n", d);
|
||||
l.add(d);
|
||||
return d;
|
||||
}
|
||||
|
||||
public List<RecalData> getDataByDinuc() {
|
||||
List<RecalData> l = new ArrayList<RecalData>(nDinucs);
|
||||
public List<RecalData> combineDinucs() {
|
||||
return combine(false, false, true);
|
||||
}
|
||||
|
||||
for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) {
|
||||
for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) {
|
||||
RecalData datum = new RecalData(-1, qual, readGroup, RecalData.dinucIndex2bases(dinucIndex));
|
||||
System.out.printf("Aggregating [%s]:%n", datum);
|
||||
for ( int pos = 0; pos < data.length; pos++ ) {
|
||||
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
|
||||
if ( datum2 != null ) {
|
||||
System.out.printf(" + [%s]:%n", datum2);
|
||||
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
|
||||
}
|
||||
}
|
||||
if ( datum.N > 0 ) l.add(datum);
|
||||
System.out.printf(" %s [%s]:%n", datum.N > 0 ? "=>" : "<>", datum);
|
||||
}
|
||||
public List<RecalData> combineCycles() {
|
||||
return combine(true, false, false);
|
||||
}
|
||||
|
||||
public List<RecalData> combine(boolean ignorePos, boolean ignoreQual, boolean ignoreDinuc) {
|
||||
List<RecalData> l = new ArrayList<RecalData>();
|
||||
for ( RecalData datum : flattenData ) {
|
||||
RecalData reduced = getMatchingDatum(l, datum, ignorePos, ignoreQual, ignoreDinuc);
|
||||
//System.out.printf("Combining %s with %s%n", datum, reduced);
|
||||
reduced.inc(datum);
|
||||
}
|
||||
|
||||
System.out.printf("getDataByDinuc => %d%n", l.size());
|
||||
return l;
|
||||
}*/
|
||||
}
|
||||
|
||||
public List<RecalData> getAll() {
|
||||
return flattenData;
|
||||
|
|
|
|||
|
|
@ -1,3 +1,28 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
|
|
@ -24,6 +49,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
@Argument(shortName="outputBAM", doc="output BAM file", required=false)
|
||||
public String outputBamFile = null;
|
||||
|
||||
@Argument(shortName="rawQempirical", doc="If provide, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
|
||||
public boolean rawQempirical = false;
|
||||
|
||||
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
||||
|
||||
private final static boolean DEBUG = false;
|
||||
|
|
@ -31,8 +59,18 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
||||
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
||||
|
||||
//@Argument(shortName="serial", doc="", required=false)
|
||||
//boolean serialRecalibration = false;
|
||||
public enum RecalibrationMode {
|
||||
COMBINATORIAL,
|
||||
JAFFE,
|
||||
BY_POS_ONLY,
|
||||
BY_DINUC_ONLY,
|
||||
SEQUENTIAL,
|
||||
ERROR
|
||||
}
|
||||
|
||||
@Argument(shortName="mode", doc="", required=false)
|
||||
public String modeString = RecalibrationMode.SEQUENTIAL.toString();
|
||||
public RecalibrationMode mode = RecalibrationMode.ERROR;
|
||||
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)");
|
||||
|
|
@ -40,6 +78,23 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
|
||||
|
||||
public void initialize() {
|
||||
//
|
||||
// crap hack until Enum arg types are supported
|
||||
//
|
||||
for ( RecalibrationMode potential : RecalibrationMode.values() ) {
|
||||
if ( potential.toString().equals(modeString)) {
|
||||
mode = potential;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( mode == RecalibrationMode.ERROR )
|
||||
throw new RuntimeException("Unknown mode requested: " + modeString);
|
||||
|
||||
|
||||
//
|
||||
// Walk over the data file, parsing it into recalibration data
|
||||
//
|
||||
int lineNumber = 0;
|
||||
try {
|
||||
System.out.printf("Reading data...%n");
|
||||
List<RecalData> data = new ArrayList<RecalData>();
|
||||
|
|
@ -48,13 +103,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
List<String> lines = new xReadLines(new File(paramsFile)).readLines();
|
||||
for ( String line : lines ) {
|
||||
//System.out.printf("Reading line %s%n", line);
|
||||
lineNumber++;
|
||||
if ( HEADER_PATTERN.matcher(line).matches() )
|
||||
continue;
|
||||
if ( COMMENT_PATTERN.matcher(line).matches() ) {
|
||||
collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos);
|
||||
collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc);
|
||||
//System.out.printf("Collapsed %b %b%n", collapsedPos, collapsedDinuc);
|
||||
}
|
||||
else {
|
||||
data.add(RecalData.fromCSVString(line));
|
||||
|
|
@ -63,13 +117,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
initializeCache(data, collapsedPos, collapsedDinuc);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
Utils.scareUser("Cannot read/find parameters file " + paramsFile);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber, e);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean parseCommentLine(Pattern pat, String line, boolean flag) {
|
||||
Matcher m = pat.matcher(line);
|
||||
if ( m.matches() ) {
|
||||
//System.out.printf("Parsing %s%n", m.group(1));
|
||||
flag = Boolean.parseBoolean(m.group(1));
|
||||
}
|
||||
|
||||
|
|
@ -95,15 +150,40 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
logger.info(String.format("Max pos : %d", maxPos));
|
||||
logger.info(String.format("Max Q reported : %d", maxQReported));
|
||||
|
||||
// initialize the data structure
|
||||
// check the mode
|
||||
switch ( mode ) {
|
||||
case JAFFE:
|
||||
collapsedPos = collapsedDinuc = true;
|
||||
break;
|
||||
case BY_POS_ONLY:
|
||||
if ( collapsedPos )
|
||||
throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
collapsedPos = true;
|
||||
break;
|
||||
case BY_DINUC_ONLY:
|
||||
if ( collapsedDinuc )
|
||||
throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
collapsedDinuc = true;
|
||||
break;
|
||||
case COMBINATORIAL:
|
||||
if ( collapsedPos || collapsedDinuc )
|
||||
throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
case SEQUENTIAL:
|
||||
if ( collapsedPos || collapsedDinuc )
|
||||
throw new RuntimeException(String.format("Cannot perform sequential recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
}
|
||||
|
||||
//
|
||||
// initialize the data structures
|
||||
//
|
||||
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
|
||||
for ( String readGroup : readGroups ) {
|
||||
// create a manager for each read group
|
||||
RecalDataManager manager = new RecalDataManager(readGroup, ! collapsedPos, ! collapsedDinuc);
|
||||
//RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc);
|
||||
managers.put(readGroup, manager);
|
||||
}
|
||||
|
||||
// fill in the manager structure
|
||||
// add the data to their appropriate readGroup managers
|
||||
for ( RecalData datum : data ) {
|
||||
managers.get(datum.readGroup).addDatum(datum);
|
||||
}
|
||||
|
|
@ -111,15 +191,36 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// fill in the table with mapping objects
|
||||
for ( String readGroup : readGroups ) {
|
||||
RecalDataManager manager = managers.get(readGroup);
|
||||
RecalMapping mapper = null;
|
||||
//if ( serialRecalibration )
|
||||
// mapper = new SerialRecalMapping(manager, dinucs, maxPos, maxQReported);
|
||||
//else
|
||||
mapper = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported);
|
||||
RecalMapping mapper = initializeMapper(manager, rawQempirical, dinucs, maxPos, maxQReported);
|
||||
logger.info(String.format("Creating mapper for %s of class %s", readGroup, mapper));
|
||||
cache.put(readGroup, mapper);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new RecalMapping object appropriate for the requested mode in this.mode for the RecalData
|
||||
* in manager. Uses the dinucs set dinucs, maxPos, and maxQReported to build efficient lookup structures
|
||||
* for mapping from old qual and features -> new qual
|
||||
*
|
||||
* @param manager
|
||||
* @param dinucs
|
||||
* @param maxPos
|
||||
* @param maxQReported
|
||||
* @return
|
||||
*/
|
||||
private RecalMapping initializeMapper( RecalDataManager manager, final boolean rawQempirical,
|
||||
Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
switch ( mode ) {
|
||||
case COMBINATORIAL:
|
||||
case JAFFE:
|
||||
return new CombinatorialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported);
|
||||
case SEQUENTIAL:
|
||||
return new SerialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported);
|
||||
default:
|
||||
throw new RuntimeException("Unimplemented recalibration mode: " + mode);
|
||||
}
|
||||
}
|
||||
|
||||
public SAMRecord map(char[] ref, SAMRecord read) {
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] quals = read.getBaseQualities();
|
||||
|
|
@ -132,16 +233,21 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
|
||||
|
||||
if (read.getReadNegativeStrandFlag())
|
||||
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
|
||||
recalQuals = BaseUtils.reverse(recalQuals);
|
||||
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||
// System.out.printf("OLD: %s%n", read.format());
|
||||
read.setBaseQualities(recalQuals);
|
||||
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||
// System.out.printf("NEW: %s%n", read.format());
|
||||
return read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Workhorse routine. Given a read group and an array of bases and quals, returns a new set of recalibrated
|
||||
* qualities for each base/qual in bases and quals. Uses the RecalMapping object associated with readGroup.
|
||||
*
|
||||
* @param readGroup
|
||||
* @param bases
|
||||
* @param quals
|
||||
* @return
|
||||
*/
|
||||
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) {
|
||||
byte[] recalQuals = new byte[quals.length];
|
||||
RecalMapping mapper = cache.get(readGroup);
|
||||
|
|
@ -150,9 +256,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
|
||||
byte qual = quals[cycle];
|
||||
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
|
||||
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||
// System.out.printf("Processing cycle=%d qual=%d: neg?=%b => %d at %s%n",
|
||||
// cycle, qual, read.getReadNegativeStrandFlag(), newQual, read.getReadName());
|
||||
recalQuals[cycle] = newQual;
|
||||
//System.out.printf("Mapping %d => %d%n", qual, newQual);
|
||||
}
|
||||
|
|
@ -160,6 +263,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
return recalQuals;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Standard I/O routines
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public void onTraversalDone(SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
output.close();
|
||||
|
|
@ -167,7 +275,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
|
||||
public SAMFileWriter reduceInit() {
|
||||
if ( outputBamFile != null ) { // ! outputBamFile.equals("") ) {
|
||||
if ( outputBamFile != null ) {
|
||||
SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
|
||||
return Utils.createSAMFileWriterWithCompression(header, true, outputBamFile, getToolkit().getBAMCompression());
|
||||
}
|
||||
|
|
@ -177,8 +285,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
|
||||
/**
|
||||
* Summarize the error rate data.
|
||||
*
|
||||
* Write out the read
|
||||
*/
|
||||
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
|
|
@ -199,7 +306,8 @@ class CombinatorialRecalMapping implements RecalMapping {
|
|||
ArrayList<byte[][]> cache;
|
||||
RecalDataManager manager;
|
||||
|
||||
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
public CombinatorialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
|
||||
Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
this.manager = manager;
|
||||
|
||||
// initialize the data structure
|
||||
|
|
@ -211,11 +319,12 @@ class CombinatorialRecalMapping implements RecalMapping {
|
|||
for ( RecalData datum : manager.getAll() ) {
|
||||
//System.out.printf("Adding datum %s%n", datum);
|
||||
byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc));
|
||||
if ( table[datum.pos][datum.qual] != 0 )
|
||||
int pos = manager.canonicalPos(datum.pos);
|
||||
if ( table[pos][datum.qual] != 0 )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
|
||||
table[datum.pos][datum.qual] = datum.empiricalQualByte();
|
||||
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
|
||||
table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical);
|
||||
//System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -241,102 +350,154 @@ class CombinatorialRecalMapping implements RecalMapping {
|
|||
}
|
||||
}
|
||||
|
||||
/*class CombinatorialRecalMapping implements RecalMapping {
|
||||
HashMap<String, byte[][]> cache = new HashMap<String, byte[][]>();
|
||||
|
||||
/**
|
||||
* Implements a serial recalibration of the reads using the combinational table. First,
|
||||
* we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc)
|
||||
*/
|
||||
class SerialRecalMapping implements RecalMapping {
|
||||
private double globalDeltaQ = 0.0;
|
||||
private double[][] deltaQPosMap, deltaQDinucMap;
|
||||
double [] deltaQualMap;
|
||||
RecalData [][] qPosSupports, qDinucSupports;
|
||||
|
||||
CombinatorialRecalMapping combiMap;
|
||||
RecalDataManager manager;
|
||||
|
||||
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
String dinucToLookAt = null; // "CC";
|
||||
int posToLookAt = 0;
|
||||
int qualToLookAt = 25;
|
||||
|
||||
public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
|
||||
Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
//combiMap = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported );
|
||||
this.manager = manager;
|
||||
|
||||
// initialize the data structure
|
||||
for ( String dinuc : dinucs ) {
|
||||
byte[][] table = new byte[maxPos+1][maxQReported+1];
|
||||
cache.put(dinuc, table);
|
||||
{
|
||||
RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll());
|
||||
double aggregrateQreported = RecalData.combinedQreported(manager.getAll());
|
||||
globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported;
|
||||
System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
|
||||
}
|
||||
|
||||
for ( RecalData datum : manager.getAll() ) {
|
||||
//System.out.printf("Adding datum %s%n", datum);
|
||||
byte [][] table = cache.get(datum.dinuc);
|
||||
if ( table[datum.pos][datum.qual] != 0 )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
|
||||
table[datum.pos][datum.qual] = datum.empiricalQualByte();
|
||||
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
|
||||
if ( printStateP(datum) )
|
||||
System.out.printf("PrintValue: %s%n", datum);
|
||||
}
|
||||
|
||||
// Jaffe-style
|
||||
deltaQualMap = new double[maxQReported+1];
|
||||
for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) {
|
||||
deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ;
|
||||
System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
|
||||
}
|
||||
|
||||
// calculate the delta Q pos array
|
||||
deltaQPosMap = new double[maxPos+1][maxQReported+1];
|
||||
qPosSupports = new RecalData[maxPos+1][maxQReported+1];
|
||||
for ( RecalData datumAtPosQual : manager.combineDinucs() ) {
|
||||
double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual];
|
||||
updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset);
|
||||
}
|
||||
|
||||
// calculate the delta Q dinuc array
|
||||
deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1];
|
||||
qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
|
||||
for ( RecalData datumAtDinucQual : manager.combineCycles() ) {
|
||||
double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual];
|
||||
updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset);
|
||||
}
|
||||
|
||||
validateTables(maxPos, maxQReported, dinucs.size(), deltaQPosMap, deltaQDinucMap, useRawQempirical, manager.getAll());
|
||||
}
|
||||
|
||||
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
||||
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
|
||||
//if ( qual == 2 )
|
||||
// System.out.printf("Qual = 2%n");
|
||||
|
||||
byte[] bp = {prevBase, base};
|
||||
String dinuc = manager.canonicalDinuc(new String(bp));
|
||||
int pos = manager.canonicalPos(cycle);
|
||||
byte[][] dataTable = cache.get(dinuc);
|
||||
|
||||
if ( dataTable == null && prevBase != 'N' && base != 'N' )
|
||||
throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc));
|
||||
|
||||
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
|
||||
|
||||
//if ( result == 2 )
|
||||
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
|
||||
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
}*/
|
||||
/*
|
||||
class SerialRecalMapping implements RecalMapping {
|
||||
// mapping from dinuc x Q => new Q
|
||||
HashMap<String, byte[]> mappingByDinuc;
|
||||
|
||||
// mapping from pos x Q => new Q
|
||||
byte[][] mappingByPos;
|
||||
|
||||
public SerialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
mappingByDinuc = new HashMap<String, byte[]>();
|
||||
for ( String dinuc : dinucs ) {
|
||||
byte[] table = new byte[maxQReported+1];
|
||||
mappingByDinuc.put(dinuc, table);
|
||||
}
|
||||
for ( RecalData datum : manager.getDataByDinuc() ) {
|
||||
//System.out.printf("Adding datum %s%n", datum);
|
||||
if ( mappingByDinuc.get(datum.dinuc).length <= datum.qual ) {
|
||||
throw new RuntimeException(String.format("Unexpectedly massive Q score of %d found, calculated max was %d%n", maxQReported, datum.qual));
|
||||
private void validateTables(int maxPos, int maxQReported, int nDinucs,
|
||||
double[][] deltaQPosMap, double[][] deltaQDinucMap,
|
||||
final boolean useRawQempirical, List<RecalData> data ) {
|
||||
for ( int i = 0; i < maxPos; i++ ) {
|
||||
for ( int j = 0; j < maxQReported; j++ ) {
|
||||
if ( printStateP(i, null, j) )
|
||||
System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n",
|
||||
i, j, deltaQPosMap[i][j], qPosSupports[i][j]);
|
||||
}
|
||||
mappingByDinuc.get(datum.dinuc)[datum.qual] = datum.empiricalQualByte();
|
||||
}
|
||||
|
||||
// initialize the mapping by position
|
||||
mappingByPos = new byte[maxPos+1][maxQReported+1];
|
||||
for ( RecalData datum : manager.getDataByPos() ) {
|
||||
//System.out.printf("Adding datum %s%n", datum);
|
||||
mappingByPos[datum.pos][datum.qual] = datum.empiricalQualByte();
|
||||
for ( int i = 0; i < nDinucs; i++ ) {
|
||||
for ( int j = 0; j < maxQReported; j++ ) {
|
||||
String dinuc = RecalData.dinucIndex2bases(i);
|
||||
if ( printStateP(0, dinuc, j ) )
|
||||
System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n",
|
||||
dinuc, j, deltaQDinucMap[i][j], qDinucSupports[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
for ( RecalData datum : RecalData.sort(data) ) {
|
||||
byte newQual = getNewQual(datum.readGroup, (byte)datum.dinuc.charAt(0), (byte)datum.dinuc.charAt(1), datum.pos, (byte)datum.qual);
|
||||
if ( printStateP( datum ) ) {
|
||||
System.out.printf("Serial mapping %s => %d => %d vs. %d => delta = %d%n",
|
||||
datum, newQual, datum.empiricalQualByte(useRawQempirical), datum.empiricalQualByte(), newQual - datum.empiricalQualByte(useRawQempirical));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void updateCache( RecalData[][] supports, RecalData datum, final boolean useRawQempirical,
|
||||
double[][] table, int i, int j, double meanQ ) {
|
||||
if ( table[i][j] != 0 )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||
double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ;
|
||||
table[i][j] = deltaQ;
|
||||
supports[i][j] = datum;
|
||||
}
|
||||
|
||||
private boolean printStateP( int cycle, String dinuc, int qual ) {
|
||||
return posToLookAt != -1 &&
|
||||
( cycle == 0 || posToLookAt == 0 || cycle == posToLookAt ) &&
|
||||
( dinucToLookAt == null || dinuc == null || manager.getDinucIndex(dinuc) == -1 || dinucToLookAt.equals(dinuc)) &&
|
||||
( qualToLookAt == 0 || qual == qualToLookAt );
|
||||
}
|
||||
|
||||
private boolean printStateP( RecalData datum ) {
|
||||
return printStateP(datum.pos, datum.dinuc, datum.qual);
|
||||
}
|
||||
|
||||
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
||||
//System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d%n", readGroup, prevBase, base, cycle, qual);
|
||||
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
|
||||
byte[] bp = {prevBase, base};
|
||||
String dinuc = new String(bp);
|
||||
int pos = manager.canonicalPos(cycle);
|
||||
int index = this.manager.getDinucIndex(prevBase, base);
|
||||
|
||||
byte newQualFromDinuc = 0;
|
||||
byte newQualFromPos = cycle > 0 && cycle < mappingByPos.length ? mappingByPos[cycle][qual] : qual;
|
||||
byte newQual = newQualFromPos;
|
||||
if ( prevBase != 'N' && base != 'N' ) {
|
||||
// if the qual got mapped too high, assume it's the best we've seen for recalibration purposes
|
||||
int newQualIndex = newQual < mappingByDinuc.get(dinuc).length ? newQual : mappingByDinuc.get(dinuc).length - 1;
|
||||
newQualFromDinuc = mappingByDinuc.get(dinuc)[newQualIndex];
|
||||
newQual = newQualFromDinuc;
|
||||
}
|
||||
double deltaQual = deltaQualMap[qual];
|
||||
double deltaQPos = pos >= deltaQPosMap.length ? 0.0 : deltaQPosMap[pos][qual]; // == length indices no data for last base
|
||||
double deltaQDinuc = index == -1 ? 0.0 : deltaQDinucMap[index][qual]; // -1 indices N in prev or current base
|
||||
double newQual = qual + globalDeltaQ + deltaQual + deltaQPos + deltaQDinuc;
|
||||
byte newQualByte = QualityUtils.boundQual((int)Math.round(newQual), QualityUtils.MAX_REASONABLE_Q_SCORE);
|
||||
|
||||
//System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d => %d => %d => %d%n",
|
||||
// readGroup, prevBase, base, cycle, qual, newQualFromPos, newQualFromDinuc, newQual);
|
||||
//if ( printStateP(pos, RecalData.dinucIndex2bases(index), qual) )
|
||||
// System.out.printf("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d%n",
|
||||
// readGroup, prevBase, base, cycle, qual,
|
||||
// qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc,
|
||||
// newQualByte);
|
||||
|
||||
|
||||
return newQual;
|
||||
//byte combiQualByte = combiMap.getNewQual(readGroup, prevBase, base, cycle, qual);
|
||||
//System.out.printf("%s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d vs %d (%d)%n",
|
||||
// readGroup, prevBase, base, cycle, qual,
|
||||
// qual, globalDeltaQ, deltaQPos, deltaQDinuc,
|
||||
// newQualByte, combiQualByte, newQualByte - combiQualByte);
|
||||
|
||||
if ( newQualByte <= 0 && newQualByte >= QualityUtils.MAX_REASONABLE_Q_SCORE )
|
||||
throw new RuntimeException(String.format("Illegal base quality score calculated: %s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d",
|
||||
readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQPos, deltaQDinuc, newQualByte));
|
||||
|
||||
return newQualByte;
|
||||
}
|
||||
}*/
|
||||
}
|
||||
|
|
@ -67,6 +67,14 @@ public class QualityUtils {
|
|||
return b;
|
||||
}
|
||||
|
||||
static public double phredScaleCorrectRate(double trueRate) {
|
||||
return phredScaleErrorRate(1-trueRate);
|
||||
}
|
||||
|
||||
static public double phredScaleErrorRate(double errorRate) {
|
||||
return -10.0*Math.log10(errorRate);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a quality score, capped at 63.
|
||||
*
|
||||
|
|
@ -74,7 +82,11 @@ public class QualityUtils {
|
|||
* @return the capped quality score
|
||||
*/
|
||||
static public byte boundQual(int qual) {
|
||||
return (byte) Math.min(qual, MAX_QUAL_SCORE);
|
||||
return boundQual(qual, MAX_QUAL_SCORE);
|
||||
}
|
||||
|
||||
static public byte boundQual(int qual, byte maxQual) {
|
||||
return (byte) Math.min(qual, maxQual);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue