More tables output by CovariateCounterWalker AND made CovariateCounterWalker and LogisticRecalibration aware of positive and negative strandedness of data which changes the regression output significantly.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@568 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-04-30 01:22:50 +00:00
parent f7a877bfeb
commit b630f2f2f1
2 changed files with 107 additions and 27 deletions

View File

@ -21,8 +21,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="63")
public int MAX_QUAL_SCORE;
@Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files")
public String LOG_REG_TRAINING_FILEROOT;
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, defaultValue="output", doc="Filename root for the outputted logistic regression training files")
public String OUTPUT_FILEROOT;
@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, defaultValue="false", doc="Create training data files for logistic regression")
public boolean CREATE_TRAINING_DATA;
int NDINUCS = 16;
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
@ -35,7 +38,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
String dinuc_root = "dinuc";
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
PrintStream covars_out = new PrintStream("covars.out");
PrintStream ByQual = new PrintStream("quality_empirical_vs_observed.csv");
PrintStream ByQualFile; // = new PrintStream("quality_empirical_vs_observed.csv");
PrintStream ByCycleFile;
PrintStream ByDinucFile;
private class RecalData {
long N;
@ -93,18 +98,23 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
if ( offset > 0 ) {
int numBases = read.getReadLength();
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
char base = (char)read.getReadBases()[offset];
char prevBase = (char)read.getReadBases()[offset-1];
int qual = (int)read.getBaseQualities()[offset];
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
// previous base is the next base in terms of machine chemistry if this is a negative strand
char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
int dinuc_index = bases2dinucIndex(prevBase,base);
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
RecalData datum = data[offset][qual][dinuc_index];
datum.inc(base,ref);
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, nuc2num[base]==nuc2num[ref] ? 0 : 1);
// Convert offset into cycle position which means reversing the position of reads on the negative strand
int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
data[cycle][qual][dinuc_index].inc(base,ref);
if (CREATE_TRAINING_DATA)
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, cycle, nuc2num[base]==nuc2num[ref] ? 0 : 1);
}
}
}
@ -120,17 +130,77 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
qualityEmpiricalObserved();
qualityDiffVsCycle();
qualityDiffVsDinucleotide();
// Close dinuc filehandles
for ( PrintStream dinuc_stream : this.dinuc_outs)
dinuc_stream.close();
if (CREATE_TRAINING_DATA)
for ( PrintStream dinuc_stream : this.dinuc_outs)
dinuc_stream.close();
}
class MeanReportedQuality {
double Qn = 0;
long n = 0;
void inc(double Q, long n) {
this.n += n;
Qn += Q * n;
}
double result() {
return Qn / n;
}
}
public void qualityDiffVsCycle() {
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
ByCycle.add(new RecalData(c, -1, "-"));
ByCycleReportedQ.add(new MeanReportedQuality());
}
for ( RecalData datum: flattenData ) {
ByCycle.get(datum.pos).inc(datum.N, datum.B);
ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
}
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
double reportedQual = ByCycleReportedQ.get(c).result();
ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
}
}
public void qualityDiffVsDinucleotide() {
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
for (int c=0; c < NDINUCS; c++) {
ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c)));
ByCycleReportedQ.add(new MeanReportedQuality());
}
for ( RecalData datum: flattenData ) {
int dinucIndex = bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1));
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
}
for (int c=0; c < NDINUCS; c++) {
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
double reportedQual = ByCycleReportedQ.get(c).result();
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
}
}
public void qualityEmpiricalObserved() {
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
ByQual.printf("Qcal, Qobs, B, N%n");
ByQualFile.printf("Qrep,Qemp,B,N%n");
for (int q=0; q<MAX_QUAL_SCORE+1; q++)
ByQ.add(new RecalData(-1,q,"-"));
@ -140,7 +210,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
for (int q=0; q<MAX_QUAL_SCORE; q++) {
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
ByQual.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
ByQualFile.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
}
}
@ -182,11 +252,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
// Print out data for regression
public CovariateCounterWalker() throws FileNotFoundException {
for ( int i=0; i<NDINUCS; i++) {
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
next_dinuc.println("logitQ,pos,indicator");
dinuc_outs.add(next_dinuc);
}
ByQual = new PrintStream("by_quality.csv");
if (CREATE_TRAINING_DATA)
for ( int i=0; i<NDINUCS; i++) {
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
next_dinuc.println("logitQ,pos,indicator");
dinuc_outs.add(next_dinuc);
}
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
}
}

View File

@ -96,25 +96,32 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
byte[] quals = read.getBaseQualities();
byte[] recalQuals = new byte[quals.length];
int numBases = read.getReadLength();
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
for ( int i = 1; i < bases.length; i++ ) {
int cycle = i;
recalQuals[numBases-1] = quals[numBases-1]; // can't change last -- no dinuc
for ( int i = 1; i < numBases-1; i++ ) { // skip first and last base, qual already set because no dinuc
// Take into account that previous base is the next base in terms of machine chemistry if this is a negative strand
int cycle = read.getReadNegativeStrandFlag() ? numBases - i - 1 : i;
String dinuc = String.format("%c%c", bases[i + (read.getReadNegativeStrandFlag() ? 1 : -1)], bases[i]);
byte qual = quals[i];
String dinuc = String.format("%c%c", bases[i-1], bases[i]);
//System.out.printf("dinuc %c %c%n", bases[i-1], bases[i]);
LogisticRegressor regressor = regressors.get(dinuc);
double P = -1;
byte newQual = qual;
byte newQual;
if ( regressor != null ) { // no N or some other unexpected bp in the stream
double logPOver1minusP = regressor.regress((double)cycle+1, (double)qual);
double gamma = regressor.regress((double)cycle+1, (double)qual);
double expGamma = Math.exp(gamma);
double finalP = expGamma / (1+expGamma);
newQual = QualityUtils.probToQual(1-finalP);
//newQual = -10 * Math.round(logPOver1minusP)
/*double POver1minusP = Math.pow(10, logPOver1minusP);
P = POver1minusP / (1 + POver1minusP);
newQual = QualityUtils.probToQual(P);*/
P = POver1minusP / (1 + POver1minusP);*/
//newQual = QualityUtils.probToQual(P);
newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63);
//newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63);
//System.out.printf("Recal %s %d %d => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, P, newQual);
}else{
newQual = qual;
}
recalQuals[i] = newQual;