Another temp check-in for rearranging things

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1048 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-18 21:04:36 +00:00
parent 3c40db260d
commit ca8a3bd85e
5 changed files with 167 additions and 61 deletions

View File

@ -55,7 +55,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="") @Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
public boolean collapseDinuc = false; public boolean collapseDinuc = false;
int NDINUCS = 16;
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>(); //ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>(); //HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>(); HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
@ -76,7 +75,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if( !isSupportedReadGroup(readGroup) ) if( !isSupportedReadGroup(readGroup) )
continue; continue;
String rg = readGroup.getReadGroupId(); String rg = readGroup.getReadGroupId();
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, ! collapsePos, ! collapseDinuc ); RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc );
data.put(rg, manager); data.put(rg, manager);
} }
} }
@ -142,7 +141,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
int qual = quals[offset]; int qual = quals[offset];
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) { if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
// previous base is the next base in terms of machine chemistry if this is a negative strand // previous base is the next base in terms of machine chemistry if this is a negative strand
//System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref)); // if ( qual == 2 )
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("Adding neg?=%b b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c name=%s%n",
// read.getReadNegativeStrandFlag(), (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base,
// ref, (char)BaseUtils.simpleComplement(ref), read.getReadName());
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base); RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
if (datum != null) datum.inc(base,ref); if (datum != null) datum.inc(base,ref);
return 1; return 1;
@ -174,7 +177,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if (CREATE_TRAINING_DATA) writeTrainingData(); if (CREATE_TRAINING_DATA) writeTrainingData();
} }
void printInfo(PrintStream out) { private void printInfo(PrintStream out) {
out.printf("# date %s%n", new Date()); out.printf("# date %s%n", new Date());
out.printf("# collapsed_pos %b%n", collapsePos); out.printf("# collapsed_pos %b%n", collapsePos);
out.printf("# collapsed_dinuc %b%n", collapseDinuc); out.printf("# collapsed_dinuc %b%n", collapseDinuc);
@ -185,16 +188,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
} }
void writeTrainingData() { private void writeTrainingData() {
PrintStream dinuc_out = null; PrintStream dinuc_out = null;
PrintStream table_out = null; PrintStream table_out = null;
try { try {
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv"); dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
dinuc_out.println("rg,dn,logitQ,pos,indicator,count"); dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) { for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
if ( RecalData.string2dinucIndex(datum.dinuc) == dinuc_index ) { if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
if ((datum.N - datum.B) > 0) if ((datum.N - datum.B) > 0)
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B); dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
if (datum.B > 0) if (datum.B > 0)
@ -207,7 +210,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if ( outputRawData ) { if ( outputRawData ) {
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv"); table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
printInfo(table_out); printInfo(table_out);
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches"); table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
if ( datum.N > 0 ) if ( datum.N > 0 )
@ -295,20 +298,20 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),""); RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
MeanReportedQuality AllReported = new MeanReportedQuality(); MeanReportedQuality AllReported = new MeanReportedQuality();
for (int c=0; c < NDINUCS; c++) { for (int c=0; c < RecalData.NDINUCS; c++) {
ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),RecalData.dinucIndex2bases(c))); ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),RecalData.dinucIndex2bases(c)));
ByCycleReportedQ.add(new MeanReportedQuality()); ByCycleReportedQ.add(new MeanReportedQuality());
} }
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
int dinucIndex = RecalData.string2dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
ByCycle.get(dinucIndex).inc(datum.N, datum.B); ByCycle.get(dinucIndex).inc(datum.N, datum.B);
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
All.inc(datum.N, datum.B); All.inc(datum.N, datum.B);
AllReported.inc(datum.qual, datum.N); AllReported.inc(datum.qual, datum.N);
} }
for (int c=0; c < NDINUCS; c++) { for (int c=0; c < RecalData.NDINUCS; c++) {
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
double reportedQual = ByCycleReportedQ.get(c).result(); double reportedQual = ByCycleReportedQ.get(c).result();
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
@ -347,7 +350,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) { for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) {
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N); double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
ByQualFile.printf("%d, %f, %.0f, %d, %d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N); ByQualFile.printf("%3d, %2.2f, %2.2f, %12d, %12d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n //out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n
} }
} }

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
import net.sf.samtools.*; import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.walkers.WalkerName; import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -12,6 +14,7 @@ import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
@WalkerName("LogisticRecalibration") @WalkerName("LogisticRecalibration")
@Requires({DataSource.READS, DataSource.REFERENCE})
public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> { public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(shortName="logisticParams", doc="logistic params file", required=true) @Argument(shortName="logisticParams", doc="logistic params file", required=true)
public String logisticParamsFile; public String logisticParamsFile;
@ -185,7 +188,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
} }
if (read.getReadNegativeStrandFlag()) if (read.getReadNegativeStrandFlag())
recalQuals = BaseUtils.reverse(quals); recalQuals = BaseUtils.reverse(recalQuals);
//System.out.printf("OLD: %s%n", read.format()); //System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals); read.setBaseQualities(recalQuals);
//System.out.printf("NEW: %s%n", read.format()); //System.out.printf("NEW: %s%n", read.format());

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers.recalibration; package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
public class RecalData { public class RecalData {
@ -53,7 +52,7 @@ public class RecalData {
} }
public String toCSVString(boolean collapsedPos) { public String toCSVString(boolean collapsedPos) {
return String.format("%s,%s,%d,%s,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B); return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B, empiricalQualByte());
} }
public static RecalData fromCSVString(String s) { public static RecalData fromCSVString(String s) {
@ -74,25 +73,26 @@ public class RecalData {
return datum; return datum;
} }
public static int bases2dinucIndex(char prevBase, char base, boolean Complement) { public static int bases2dinucIndex(char prevBase, char base) {
if ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 || BaseUtils.simpleBaseToBaseIndex(base) == -1 ) int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase);
return -1; int bI = BaseUtils.simpleBaseToBaseIndex(base);
return (pbI == -1 || bI == -1) ? -1 : pbI * 4 + bI;
if (!Complement) {
return BaseUtils.simpleBaseToBaseIndex(prevBase) * 4 + BaseUtils.simpleBaseToBaseIndex(base);
}else {
return (3 - BaseUtils.simpleBaseToBaseIndex(prevBase)) * 4 + (3 - BaseUtils.simpleBaseToBaseIndex(base));
}
} }
public final static int NDINUCS = 16;
public static String dinucIndex2bases(int index) { public static String dinucIndex2bases(int index) {
char data[] = {BaseUtils.baseIndexToSimpleBase(index / 4), BaseUtils.baseIndexToSimpleBase(index % 4)}; char data[] = {BaseUtils.baseIndexToSimpleBase(index / 4), BaseUtils.baseIndexToSimpleBase(index % 4)};
return new String( data ); return new String( data );
} }
public static int string2dinucIndex(String s) { public static int dinucIndex(String s) {
return bases2dinucIndex(s.charAt(0), s.charAt(1), false); return bases2dinucIndex(s.charAt(0), s.charAt(1));
} }
/**
 * Byte-based overload of the dinucleotide index lookup.
 * Both bases are cast to char and handed to bases2dinucIndex(char, char),
 * whose result is returned unchanged.
 *
 * @param prevBase the base preceding {@code base}, as a byte
 * @param base     the current base, as a byte
 * @return whatever bases2dinucIndex returns for the two bases
 */
public static int dinucIndex(byte prevBase, byte base) {
    final char first = (char) prevBase;
    final char second = (char) base;
    return bases2dinucIndex(first, second);
}
// //
// private static int nuc2num[]; // private static int nuc2num[];
// private static char num2nuc[]; // private static char num2nuc[];

View File

@ -33,12 +33,21 @@ public class RecalDataManager {
return trackPos ? pos : 0; return trackPos ? pos : 0;
} }
public int getDinucIndex(String dinuc) { public int canonicalPos(int cycle) {
if ( trackDinuc ) { return getPosIndex(cycle);
return RecalData.string2dinucIndex(dinuc);
} else {
return 0;
} }
/**
 * Maps a dinucleotide string to the index used by this manager.
 *
 * @param dinuc the dinucleotide string
 * @return 0 when dinucleotides are not being tracked, otherwise RecalData.dinucIndex(dinuc)
 */
public int getDinucIndex(String dinuc) {
    if ( ! trackDinuc )
        return 0;
    return RecalData.dinucIndex(dinuc);
}
/**
 * Maps a pair of bases to the dinucleotide index used by this manager.
 *
 * @param prevBase the base preceding {@code base}
 * @param base     the current base
 * @return 0 when dinucleotides are not being tracked, otherwise RecalData.dinucIndex(prevBase, base)
 */
public int getDinucIndex(byte prevBase, byte base) {
    if ( ! trackDinuc )
        return 0;
    return RecalData.dinucIndex(prevBase, base);
}
public String canonicalDinuc(String dinuc) {
return trackDinuc ? dinuc : "**";
} }
public void addDatum(RecalData datum) { public void addDatum(RecalData datum) {

View File

@ -3,16 +3,21 @@ package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
import net.sf.samtools.*; import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.walkers.WalkerName; import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.playground.gatk.walkers.recalibration.RecalData; import org.broadinstitute.sting.playground.gatk.walkers.recalibration.RecalData;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import java.util.*; import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
@WalkerName("TableRecalibration") @WalkerName("TableRecalibration")
@Requires({DataSource.READS, DataSource.REFERENCE})
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> { public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(shortName="params", doc="CountCovariates params file", required=true) @Argument(shortName="params", doc="CountCovariates params file", required=true)
public String paramsFile; public String paramsFile;
@ -27,26 +32,52 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]] // maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>(); HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
@Argument(shortName="serial", doc="", required=false) //@Argument(shortName="serial", doc="", required=false)
boolean serialRecalibration = false; boolean serialRecalibration = false;
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)");
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
public void initialize() { public void initialize() {
try { try {
System.out.printf("Reading data...%n"); System.out.printf("Reading data...%n");
List<RecalData> data = new ArrayList<RecalData>(); List<RecalData> data = new ArrayList<RecalData>();
boolean collapsedPos = false;
boolean collapsedDinuc = false;
List<String> lines = new xReadLines(new File(paramsFile)).readLines(); List<String> lines = new xReadLines(new File(paramsFile)).readLines();
for ( String line : lines ) { for ( String line : lines ) {
// rg,dn,logitQ,pos,indicator,count //System.out.printf("Reading line %s%n", line);
// SRR007069,AA,28,1,0,2 if ( HEADER_PATTERN.matcher(line).matches() )
continue;
if ( COMMENT_PATTERN.matcher(line).matches() ) {
collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos);
collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc);
//System.out.printf("Collapsed %b %b%n", collapsedPos, collapsedDinuc);
}
else {
data.add(RecalData.fromCSVString(line)); data.add(RecalData.fromCSVString(line));
} }
initializeCache(data); }
initializeCache(data, collapsedPos, collapsedDinuc);
} catch ( FileNotFoundException e ) { } catch ( FileNotFoundException e ) {
Utils.scareUser("Cannot read/find parameters file " + paramsFile); Utils.scareUser("Cannot read/find parameters file " + paramsFile);
} }
} }
private void initializeCache(List<RecalData> data) { private boolean parseCommentLine(Pattern pat, String line, boolean flag) {
Matcher m = pat.matcher(line);
if ( m.matches() ) {
//System.out.printf("Parsing %s%n", m.group(1));
flag = Boolean.parseBoolean(m.group(1));
}
return flag;
}
private void initializeCache(List<RecalData> data, boolean collapsedPos, boolean collapsedDinuc ) {
Set<String> readGroups = new HashSet<String>(); Set<String> readGroups = new HashSet<String>();
Set<String> dinucs = new HashSet<String>(); Set<String> dinucs = new HashSet<String>();
int maxPos = -1; int maxPos = -1;
@ -68,7 +99,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// initialize the data structure // initialize the data structure
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>(); HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
for ( String readGroup : readGroups ) { for ( String readGroup : readGroups ) {
RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), true, true); RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc);
managers.put(readGroup, manager); managers.put(readGroup, manager);
} }
@ -90,13 +121,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} }
public SAMRecord map(char[] ref, SAMRecord read) { public SAMRecord map(char[] ref, SAMRecord read) {
//if ( read.getReadLength() > maxReadLen ) {
// throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format());
//}
byte[] bases = read.getReadBases(); byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities(); byte[] quals = read.getBaseQualities();
byte[] recalQuals = new byte[quals.length];
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads // Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
if (read.getReadNegativeStrandFlag()) { if (read.getReadNegativeStrandFlag()) {
@ -104,28 +130,34 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
quals = BaseUtils.reverse(quals); quals = BaseUtils.reverse(quals);
} }
String readGroup = read.getAttribute("RG").toString(); byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
if (read.getReadNegativeStrandFlag())
recalQuals = BaseUtils.reverse(recalQuals);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("NEW: %s%n", read.format());
return read;
}
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) {
byte[] recalQuals = new byte[quals.length];
RecalMapping mapper = cache.get(readGroup); RecalMapping mapper = cache.get(readGroup);
int numBases = read.getReadLength();
recalQuals[0] = quals[0]; // can't change the first -- no dinuc recalQuals[0] = quals[0]; // can't change the first -- no dinuc
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
for ( int cycle = 1; cycle < numBases; cycle++ ) { // skip first and last base, qual already set because no dinuc
// Take into account that previous base is the next base in terms of machine chemistry if
// this is a negative strand
byte qual = quals[cycle]; byte qual = quals[cycle];
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual); byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("Processing cycle=%d qual=%d: neg?=%b => %d at %s%n",
// cycle, qual, read.getReadNegativeStrandFlag(), newQual, read.getReadName());
recalQuals[cycle] = newQual; recalQuals[cycle] = newQual;
//System.out.printf("Mapping %d => %d%n", qual, newQual); //System.out.printf("Mapping %d => %d%n", qual, newQual);
} }
if (read.getReadNegativeStrandFlag()) return recalQuals;
recalQuals = BaseUtils.reverse(quals);
//System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals);
//System.out.printf("NEW: %s%n", read.format());
return read;
} }
public void onTraversalDone(SAMFileWriter output) { public void onTraversalDone(SAMFileWriter output) {
@ -164,9 +196,58 @@ interface RecalMapping {
} }
class CombinatorialRecalMapping implements RecalMapping { class CombinatorialRecalMapping implements RecalMapping {
HashMap<String, byte[][]> cache = new HashMap<String, byte[][]>(); ArrayList<byte[][]> cache;
RecalDataManager manager;
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) { public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
this.manager = manager;
// initialize the data structure
cache = new ArrayList<byte[][]>(RecalData.NDINUCS);
for ( String dinuc : dinucs ) {
cache.add(new byte[maxPos+1][maxQReported+1]);
}
for ( RecalData datum : manager.getAll() ) {
//System.out.printf("Adding datum %s%n", datum);
byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc));
if ( table[datum.pos][datum.qual] != 0 )
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
table[datum.pos][datum.qual] = datum.empiricalQualByte();
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
}
}
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
//if ( qual == 2 )
// System.out.printf("Qual = 2%n");
int pos = manager.canonicalPos(cycle);
int index = this.manager.getDinucIndex(prevBase, base);
byte[][] dataTable = index == -1 ? null : cache.get(index);
if ( dataTable == null && prevBase != 'N' && base != 'N' )
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, prevBase, base));
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
//if ( result == 2 )
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
return result;
}
}
/*class CombinatorialRecalMapping implements RecalMapping {
HashMap<String, byte[][]> cache = new HashMap<String, byte[][]>();
RecalDataManager manager;
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
this.manager = manager;
// initialize the data structure // initialize the data structure
for ( String dinuc : dinucs ) { for ( String dinuc : dinucs ) {
byte[][] table = new byte[maxPos+1][maxQReported+1]; byte[][] table = new byte[maxPos+1][maxQReported+1];
@ -180,22 +261,32 @@ class CombinatorialRecalMapping implements RecalMapping {
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte()); //table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
table[datum.pos][datum.qual] = datum.empiricalQualByte(); table[datum.pos][datum.qual] = datum.empiricalQualByte();
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
} }
} }
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) { public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
//System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d%n", readGroup, prevBase, base, cycle, qual);
//String dinuc = String.format("%c%c", (char)prevBase, (char)base); //String dinuc = String.format("%c%c", (char)prevBase, (char)base);
//if ( qual == 2 )
// System.out.printf("Qual = 2%n");
byte[] bp = {prevBase, base}; byte[] bp = {prevBase, base};
String dinuc = new String(bp); String dinuc = manager.canonicalDinuc(new String(bp));
int pos = manager.canonicalPos(cycle);
byte[][] dataTable = cache.get(dinuc); byte[][] dataTable = cache.get(dinuc);
if ( dataTable == null && prevBase != 'N' && base != 'N' ) if ( dataTable == null && prevBase != 'N' && base != 'N' )
throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc)); throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc));
return dataTable != null && cycle < dataTable.length ? dataTable[cycle][qual] : qual; byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
}
//if ( result == 2 )
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
return result;
} }
}*/
class SerialRecalMapping implements RecalMapping { class SerialRecalMapping implements RecalMapping {
// mapping from dinuc x Q => new Q // mapping from dinuc x Q => new Q