Another temp checking for rearranging things
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1048 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3c40db260d
commit
ca8a3bd85e
|
|
@ -55,7 +55,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
||||||
public boolean collapseDinuc = false;
|
public boolean collapseDinuc = false;
|
||||||
|
|
||||||
int NDINUCS = 16;
|
|
||||||
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||||
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
|
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
|
||||||
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
||||||
|
|
@ -76,7 +75,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
if( !isSupportedReadGroup(readGroup) )
|
if( !isSupportedReadGroup(readGroup) )
|
||||||
continue;
|
continue;
|
||||||
String rg = readGroup.getReadGroupId();
|
String rg = readGroup.getReadGroupId();
|
||||||
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, ! collapsePos, ! collapseDinuc );
|
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc );
|
||||||
data.put(rg, manager);
|
data.put(rg, manager);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -142,7 +141,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
int qual = quals[offset];
|
int qual = quals[offset];
|
||||||
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
|
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
|
||||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||||
//System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref));
|
// if ( qual == 2 )
|
||||||
|
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||||
|
// System.out.printf("Adding neg?=%b b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c name=%s%n",
|
||||||
|
// read.getReadNegativeStrandFlag(), (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base,
|
||||||
|
// ref, (char)BaseUtils.simpleComplement(ref), read.getReadName());
|
||||||
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
||||||
if (datum != null) datum.inc(base,ref);
|
if (datum != null) datum.inc(base,ref);
|
||||||
return 1;
|
return 1;
|
||||||
|
|
@ -174,7 +177,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
if (CREATE_TRAINING_DATA) writeTrainingData();
|
if (CREATE_TRAINING_DATA) writeTrainingData();
|
||||||
}
|
}
|
||||||
|
|
||||||
void printInfo(PrintStream out) {
|
private void printInfo(PrintStream out) {
|
||||||
out.printf("# date %s%n", new Date());
|
out.printf("# date %s%n", new Date());
|
||||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||||
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||||
|
|
@ -185,16 +188,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void writeTrainingData() {
|
private void writeTrainingData() {
|
||||||
PrintStream dinuc_out = null;
|
PrintStream dinuc_out = null;
|
||||||
PrintStream table_out = null;
|
PrintStream table_out = null;
|
||||||
try {
|
try {
|
||||||
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
|
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
|
||||||
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
|
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
|
||||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||||
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
|
for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
|
||||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||||
if ( RecalData.string2dinucIndex(datum.dinuc) == dinuc_index ) {
|
if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
|
||||||
if ((datum.N - datum.B) > 0)
|
if ((datum.N - datum.B) > 0)
|
||||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
|
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
|
||||||
if (datum.B > 0)
|
if (datum.B > 0)
|
||||||
|
|
@ -207,7 +210,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
if ( outputRawData ) {
|
if ( outputRawData ) {
|
||||||
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
|
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
|
||||||
printInfo(table_out);
|
printInfo(table_out);
|
||||||
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches");
|
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
|
||||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||||
if ( datum.N > 0 )
|
if ( datum.N > 0 )
|
||||||
|
|
@ -295,20 +298,20 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||||
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
|
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
|
||||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||||
for (int c=0; c < NDINUCS; c++) {
|
for (int c=0; c < RecalData.NDINUCS; c++) {
|
||||||
ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),RecalData.dinucIndex2bases(c)));
|
ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),RecalData.dinucIndex2bases(c)));
|
||||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||||
int dinucIndex = RecalData.string2dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
|
int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
|
||||||
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
|
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
|
||||||
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
|
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
|
||||||
All.inc(datum.N, datum.B);
|
All.inc(datum.N, datum.B);
|
||||||
AllReported.inc(datum.qual, datum.N);
|
AllReported.inc(datum.qual, datum.N);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int c=0; c < NDINUCS; c++) {
|
for (int c=0; c < RecalData.NDINUCS; c++) {
|
||||||
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||||
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||||
|
|
@ -347,7 +350,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) {
|
for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) {
|
||||||
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
||||||
ByQualFile.printf("%d, %f, %.0f, %d, %d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
|
ByQualFile.printf("%3d, %2.2f, %2.2f, %12d, %12d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
|
||||||
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n
|
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
@ -12,6 +14,7 @@ import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
@WalkerName("LogisticRecalibration")
|
@WalkerName("LogisticRecalibration")
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||||
@Argument(shortName="logisticParams", doc="logistic params file", required=true)
|
@Argument(shortName="logisticParams", doc="logistic params file", required=true)
|
||||||
public String logisticParamsFile;
|
public String logisticParamsFile;
|
||||||
|
|
@ -185,7 +188,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
||||||
}
|
}
|
||||||
|
|
||||||
if (read.getReadNegativeStrandFlag())
|
if (read.getReadNegativeStrandFlag())
|
||||||
recalQuals = BaseUtils.reverse(quals);
|
recalQuals = BaseUtils.reverse(recalQuals);
|
||||||
//System.out.printf("OLD: %s%n", read.format());
|
//System.out.printf("OLD: %s%n", read.format());
|
||||||
read.setBaseQualities(recalQuals);
|
read.setBaseQualities(recalQuals);
|
||||||
//System.out.printf("NEW: %s%n", read.format());
|
//System.out.printf("NEW: %s%n", read.format());
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
|
package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
public class RecalData {
|
public class RecalData {
|
||||||
|
|
@ -53,7 +52,7 @@ public class RecalData {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toCSVString(boolean collapsedPos) {
|
public String toCSVString(boolean collapsedPos) {
|
||||||
return String.format("%s,%s,%d,%s,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B);
|
return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B, empiricalQualByte());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static RecalData fromCSVString(String s) {
|
public static RecalData fromCSVString(String s) {
|
||||||
|
|
@ -74,25 +73,26 @@ public class RecalData {
|
||||||
return datum;
|
return datum;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int bases2dinucIndex(char prevBase, char base, boolean Complement) {
|
public static int bases2dinucIndex(char prevBase, char base) {
|
||||||
if ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 || BaseUtils.simpleBaseToBaseIndex(base) == -1 )
|
int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase);
|
||||||
return -1;
|
int bI = BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
|
return (pbI == -1 || bI == -1) ? -1 : pbI * 4 + bI;
|
||||||
if (!Complement) {
|
|
||||||
return BaseUtils.simpleBaseToBaseIndex(prevBase) * 4 + BaseUtils.simpleBaseToBaseIndex(base);
|
|
||||||
}else {
|
|
||||||
return (3 - BaseUtils.simpleBaseToBaseIndex(prevBase)) * 4 + (3 - BaseUtils.simpleBaseToBaseIndex(base));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public final static int NDINUCS = 16;
|
||||||
public static String dinucIndex2bases(int index) {
|
public static String dinucIndex2bases(int index) {
|
||||||
char data[] = {BaseUtils.baseIndexToSimpleBase(index / 4), BaseUtils.baseIndexToSimpleBase(index % 4)};
|
char data[] = {BaseUtils.baseIndexToSimpleBase(index / 4), BaseUtils.baseIndexToSimpleBase(index % 4)};
|
||||||
return new String( data );
|
return new String( data );
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int string2dinucIndex(String s) {
|
public static int dinucIndex(String s) {
|
||||||
return bases2dinucIndex(s.charAt(0), s.charAt(1), false);
|
return bases2dinucIndex(s.charAt(0), s.charAt(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int dinucIndex(byte prevBase, byte base) {
|
||||||
|
return bases2dinucIndex((char)prevBase, (char)base);
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// private static int nuc2num[];
|
// private static int nuc2num[];
|
||||||
// private static char num2nuc[];
|
// private static char num2nuc[];
|
||||||
|
|
|
||||||
|
|
@ -33,12 +33,21 @@ public class RecalDataManager {
|
||||||
return trackPos ? pos : 0;
|
return trackPos ? pos : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getDinucIndex(String dinuc) {
|
public int canonicalPos(int cycle) {
|
||||||
if ( trackDinuc ) {
|
return getPosIndex(cycle);
|
||||||
return RecalData.string2dinucIndex(dinuc);
|
|
||||||
} else {
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int getDinucIndex(String dinuc) {
|
||||||
|
return trackDinuc ? RecalData.dinucIndex(dinuc) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getDinucIndex(byte prevBase, byte base) {
|
||||||
|
return trackDinuc ? RecalData.dinucIndex(prevBase, base) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String canonicalDinuc(String dinuc) {
|
||||||
|
return trackDinuc ? dinuc : "**";
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addDatum(RecalData datum) {
|
public void addDatum(RecalData datum) {
|
||||||
|
|
|
||||||
|
|
@ -3,16 +3,21 @@ package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.playground.gatk.walkers.recalibration.RecalData;
|
import org.broadinstitute.sting.playground.gatk.walkers.recalibration.RecalData;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
@WalkerName("TableRecalibration")
|
@WalkerName("TableRecalibration")
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||||
@Argument(shortName="params", doc="CountCovariates params file", required=true)
|
@Argument(shortName="params", doc="CountCovariates params file", required=true)
|
||||||
public String paramsFile;
|
public String paramsFile;
|
||||||
|
|
@ -27,26 +32,52 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
||||||
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
||||||
|
|
||||||
@Argument(shortName="serial", doc="", required=false)
|
//@Argument(shortName="serial", doc="", required=false)
|
||||||
boolean serialRecalibration = false;
|
boolean serialRecalibration = false;
|
||||||
|
|
||||||
|
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||||
|
private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)");
|
||||||
|
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
|
||||||
|
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
try {
|
try {
|
||||||
System.out.printf("Reading data...%n");
|
System.out.printf("Reading data...%n");
|
||||||
List<RecalData> data = new ArrayList<RecalData>();
|
List<RecalData> data = new ArrayList<RecalData>();
|
||||||
|
boolean collapsedPos = false;
|
||||||
|
boolean collapsedDinuc = false;
|
||||||
|
|
||||||
List<String> lines = new xReadLines(new File(paramsFile)).readLines();
|
List<String> lines = new xReadLines(new File(paramsFile)).readLines();
|
||||||
for ( String line : lines ) {
|
for ( String line : lines ) {
|
||||||
// rg,dn,logitQ,pos,indicator,count
|
//System.out.printf("Reading line %s%n", line);
|
||||||
// SRR007069,AA,28,1,0,2
|
if ( HEADER_PATTERN.matcher(line).matches() )
|
||||||
|
continue;
|
||||||
|
if ( COMMENT_PATTERN.matcher(line).matches() ) {
|
||||||
|
collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos);
|
||||||
|
collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc);
|
||||||
|
//System.out.printf("Collapsed %b %b%n", collapsedPos, collapsedDinuc);
|
||||||
|
}
|
||||||
|
else {
|
||||||
data.add(RecalData.fromCSVString(line));
|
data.add(RecalData.fromCSVString(line));
|
||||||
}
|
}
|
||||||
initializeCache(data);
|
}
|
||||||
|
initializeCache(data, collapsedPos, collapsedDinuc);
|
||||||
} catch ( FileNotFoundException e ) {
|
} catch ( FileNotFoundException e ) {
|
||||||
Utils.scareUser("Cannot read/find parameters file " + paramsFile);
|
Utils.scareUser("Cannot read/find parameters file " + paramsFile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initializeCache(List<RecalData> data) {
|
private boolean parseCommentLine(Pattern pat, String line, boolean flag) {
|
||||||
|
Matcher m = pat.matcher(line);
|
||||||
|
if ( m.matches() ) {
|
||||||
|
//System.out.printf("Parsing %s%n", m.group(1));
|
||||||
|
flag = Boolean.parseBoolean(m.group(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
return flag;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initializeCache(List<RecalData> data, boolean collapsedPos, boolean collapsedDinuc ) {
|
||||||
Set<String> readGroups = new HashSet<String>();
|
Set<String> readGroups = new HashSet<String>();
|
||||||
Set<String> dinucs = new HashSet<String>();
|
Set<String> dinucs = new HashSet<String>();
|
||||||
int maxPos = -1;
|
int maxPos = -1;
|
||||||
|
|
@ -68,7 +99,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
// initialize the data structure
|
// initialize the data structure
|
||||||
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
|
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
|
||||||
for ( String readGroup : readGroups ) {
|
for ( String readGroup : readGroups ) {
|
||||||
RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), true, true);
|
RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc);
|
||||||
managers.put(readGroup, manager);
|
managers.put(readGroup, manager);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -90,13 +121,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
}
|
}
|
||||||
|
|
||||||
public SAMRecord map(char[] ref, SAMRecord read) {
|
public SAMRecord map(char[] ref, SAMRecord read) {
|
||||||
//if ( read.getReadLength() > maxReadLen ) {
|
|
||||||
// throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format());
|
|
||||||
//}
|
|
||||||
|
|
||||||
byte[] bases = read.getReadBases();
|
byte[] bases = read.getReadBases();
|
||||||
byte[] quals = read.getBaseQualities();
|
byte[] quals = read.getBaseQualities();
|
||||||
byte[] recalQuals = new byte[quals.length];
|
|
||||||
|
|
||||||
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
|
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
|
||||||
if (read.getReadNegativeStrandFlag()) {
|
if (read.getReadNegativeStrandFlag()) {
|
||||||
|
|
@ -104,28 +130,34 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
quals = BaseUtils.reverse(quals);
|
quals = BaseUtils.reverse(quals);
|
||||||
}
|
}
|
||||||
|
|
||||||
String readGroup = read.getAttribute("RG").toString();
|
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
|
||||||
|
|
||||||
|
if (read.getReadNegativeStrandFlag())
|
||||||
|
recalQuals = BaseUtils.reverse(recalQuals);
|
||||||
|
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||||
|
// System.out.printf("OLD: %s%n", read.format());
|
||||||
|
read.setBaseQualities(recalQuals);
|
||||||
|
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||||
|
// System.out.printf("NEW: %s%n", read.format());
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) {
|
||||||
|
byte[] recalQuals = new byte[quals.length];
|
||||||
RecalMapping mapper = cache.get(readGroup);
|
RecalMapping mapper = cache.get(readGroup);
|
||||||
|
|
||||||
int numBases = read.getReadLength();
|
|
||||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||||
|
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
|
||||||
for ( int cycle = 1; cycle < numBases; cycle++ ) { // skip first and last base, qual already set because no dinuc
|
|
||||||
// Take into account that previous base is the next base in terms of machine chemistry if
|
|
||||||
// this is a negative strand
|
|
||||||
byte qual = quals[cycle];
|
byte qual = quals[cycle];
|
||||||
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
|
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
|
||||||
|
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||||
|
// System.out.printf("Processing cycle=%d qual=%d: neg?=%b => %d at %s%n",
|
||||||
|
// cycle, qual, read.getReadNegativeStrandFlag(), newQual, read.getReadName());
|
||||||
recalQuals[cycle] = newQual;
|
recalQuals[cycle] = newQual;
|
||||||
//System.out.printf("Mapping %d => %d%n", qual, newQual);
|
//System.out.printf("Mapping %d => %d%n", qual, newQual);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (read.getReadNegativeStrandFlag())
|
return recalQuals;
|
||||||
recalQuals = BaseUtils.reverse(quals);
|
|
||||||
//System.out.printf("OLD: %s%n", read.format());
|
|
||||||
read.setBaseQualities(recalQuals);
|
|
||||||
//System.out.printf("NEW: %s%n", read.format());
|
|
||||||
return read;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone(SAMFileWriter output) {
|
public void onTraversalDone(SAMFileWriter output) {
|
||||||
|
|
@ -164,9 +196,58 @@ interface RecalMapping {
|
||||||
}
|
}
|
||||||
|
|
||||||
class CombinatorialRecalMapping implements RecalMapping {
|
class CombinatorialRecalMapping implements RecalMapping {
|
||||||
HashMap<String, byte[][]> cache = new HashMap<String, byte[][]>();
|
ArrayList<byte[][]> cache;
|
||||||
|
RecalDataManager manager;
|
||||||
|
|
||||||
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
|
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||||
|
this.manager = manager;
|
||||||
|
|
||||||
|
// initialize the data structure
|
||||||
|
cache = new ArrayList<byte[][]>(RecalData.NDINUCS);
|
||||||
|
for ( String dinuc : dinucs ) {
|
||||||
|
cache.add(new byte[maxPos+1][maxQReported+1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( RecalData datum : manager.getAll() ) {
|
||||||
|
//System.out.printf("Adding datum %s%n", datum);
|
||||||
|
byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc));
|
||||||
|
if ( table[datum.pos][datum.qual] != 0 )
|
||||||
|
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||||
|
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
|
||||||
|
table[datum.pos][datum.qual] = datum.empiricalQualByte();
|
||||||
|
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
||||||
|
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
|
||||||
|
//if ( qual == 2 )
|
||||||
|
// System.out.printf("Qual = 2%n");
|
||||||
|
|
||||||
|
int pos = manager.canonicalPos(cycle);
|
||||||
|
int index = this.manager.getDinucIndex(prevBase, base);
|
||||||
|
byte[][] dataTable = index == -1 ? null : cache.get(index);
|
||||||
|
|
||||||
|
if ( dataTable == null && prevBase != 'N' && base != 'N' )
|
||||||
|
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, prevBase, base));
|
||||||
|
|
||||||
|
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
|
||||||
|
|
||||||
|
//if ( result == 2 )
|
||||||
|
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
|
||||||
|
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*class CombinatorialRecalMapping implements RecalMapping {
|
||||||
|
HashMap<String, byte[][]> cache = new HashMap<String, byte[][]>();
|
||||||
|
RecalDataManager manager;
|
||||||
|
|
||||||
|
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||||
|
this.manager = manager;
|
||||||
|
|
||||||
// initialize the data structure
|
// initialize the data structure
|
||||||
for ( String dinuc : dinucs ) {
|
for ( String dinuc : dinucs ) {
|
||||||
byte[][] table = new byte[maxPos+1][maxQReported+1];
|
byte[][] table = new byte[maxPos+1][maxQReported+1];
|
||||||
|
|
@ -180,22 +261,32 @@ class CombinatorialRecalMapping implements RecalMapping {
|
||||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||||
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
|
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
|
||||||
table[datum.pos][datum.qual] = datum.empiricalQualByte();
|
table[datum.pos][datum.qual] = datum.empiricalQualByte();
|
||||||
|
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
||||||
//System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d%n", readGroup, prevBase, base, cycle, qual);
|
|
||||||
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
|
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
|
||||||
|
//if ( qual == 2 )
|
||||||
|
// System.out.printf("Qual = 2%n");
|
||||||
|
|
||||||
byte[] bp = {prevBase, base};
|
byte[] bp = {prevBase, base};
|
||||||
String dinuc = new String(bp);
|
String dinuc = manager.canonicalDinuc(new String(bp));
|
||||||
|
int pos = manager.canonicalPos(cycle);
|
||||||
byte[][] dataTable = cache.get(dinuc);
|
byte[][] dataTable = cache.get(dinuc);
|
||||||
|
|
||||||
if ( dataTable == null && prevBase != 'N' && base != 'N' )
|
if ( dataTable == null && prevBase != 'N' && base != 'N' )
|
||||||
throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc));
|
throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc));
|
||||||
|
|
||||||
return dataTable != null && cycle < dataTable.length ? dataTable[cycle][qual] : qual;
|
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
|
||||||
}
|
|
||||||
|
//if ( result == 2 )
|
||||||
|
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
|
||||||
|
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
}*/
|
||||||
|
|
||||||
class SerialRecalMapping implements RecalMapping {
|
class SerialRecalMapping implements RecalMapping {
|
||||||
// mapping from dinuc x Q => new Q
|
// mapping from dinuc x Q => new Q
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue