More tables output by CovariateCounterWalker AND made CovariateCounterWalker and LogisticRecalibration aware of positive and negative strandedness of data which changes the regression output significantly.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@568 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f7a877bfeb
commit
b630f2f2f1
|
|
@ -21,8 +21,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="63")
|
||||
public int MAX_QUAL_SCORE;
|
||||
|
||||
@Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files")
|
||||
public String LOG_REG_TRAINING_FILEROOT;
|
||||
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, defaultValue="output", doc="Filename root for the outputted logistic regression training files")
|
||||
public String OUTPUT_FILEROOT;
|
||||
|
||||
@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, defaultValue="false", doc="Create training data files for logistic regression")
|
||||
public boolean CREATE_TRAINING_DATA;
|
||||
|
||||
int NDINUCS = 16;
|
||||
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
||||
|
|
@ -35,7 +38,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
String dinuc_root = "dinuc";
|
||||
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
||||
PrintStream covars_out = new PrintStream("covars.out");
|
||||
PrintStream ByQual = new PrintStream("quality_empirical_vs_observed.csv");
|
||||
PrintStream ByQualFile; // = new PrintStream("quality_empirical_vs_observed.csv");
|
||||
PrintStream ByCycleFile;
|
||||
PrintStream ByDinucFile;
|
||||
|
||||
private class RecalData {
|
||||
long N;
|
||||
|
|
@ -93,18 +98,23 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < reads.size(); i++ ) {
|
||||
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
if ( offset > 0 ) {
|
||||
int numBases = read.getReadLength();
|
||||
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||
char base = (char)read.getReadBases()[offset];
|
||||
char prevBase = (char)read.getReadBases()[offset-1];
|
||||
int qual = (int)read.getBaseQualities()[offset];
|
||||
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||
char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
||||
int dinuc_index = bases2dinucIndex(prevBase,base);
|
||||
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
||||
RecalData datum = data[offset][qual][dinuc_index];
|
||||
datum.inc(base,ref);
|
||||
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, nuc2num[base]==nuc2num[ref] ? 0 : 1);
|
||||
// Convert offset into cycle position which means reversing the position of reads on the negative strand
|
||||
int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
|
||||
data[cycle][qual][dinuc_index].inc(base,ref);
|
||||
if (CREATE_TRAINING_DATA)
|
||||
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, cycle, nuc2num[base]==nuc2num[ref] ? 0 : 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -120,17 +130,77 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
qualityEmpiricalObserved();
|
||||
qualityDiffVsCycle();
|
||||
qualityDiffVsDinucleotide();
|
||||
|
||||
// Close dinuc filehandles
|
||||
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
||||
dinuc_stream.close();
|
||||
if (CREATE_TRAINING_DATA)
|
||||
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
||||
dinuc_stream.close();
|
||||
|
||||
}
|
||||
|
||||
class MeanReportedQuality {
|
||||
double Qn = 0;
|
||||
long n = 0;
|
||||
|
||||
void inc(double Q, long n) {
|
||||
this.n += n;
|
||||
Qn += Q * n;
|
||||
}
|
||||
|
||||
double result() {
|
||||
return Qn / n;
|
||||
}
|
||||
}
|
||||
|
||||
public void qualityDiffVsCycle() {
|
||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
||||
ByCycle.add(new RecalData(c, -1, "-"));
|
||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
for ( RecalData datum: flattenData ) {
|
||||
ByCycle.get(datum.pos).inc(datum.N, datum.B);
|
||||
ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
|
||||
}
|
||||
|
||||
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||
ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||
}
|
||||
}
|
||||
|
||||
public void qualityDiffVsDinucleotide() {
|
||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||
for (int c=0; c < NDINUCS; c++) {
|
||||
ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c)));
|
||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
for ( RecalData datum: flattenData ) {
|
||||
int dinucIndex = bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1));
|
||||
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
|
||||
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
|
||||
}
|
||||
|
||||
for (int c=0; c < NDINUCS; c++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||
}
|
||||
}
|
||||
|
||||
public void qualityEmpiricalObserved() {
|
||||
|
||||
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
||||
ByQual.printf("Qcal, Qobs, B, N%n");
|
||||
ByQualFile.printf("Qrep,Qemp,B,N%n");
|
||||
for (int q=0; q<MAX_QUAL_SCORE+1; q++)
|
||||
ByQ.add(new RecalData(-1,q,"-"));
|
||||
|
||||
|
|
@ -140,7 +210,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
for (int q=0; q<MAX_QUAL_SCORE; q++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
||||
ByQual.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
|
||||
ByQualFile.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
|
||||
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
||||
}
|
||||
}
|
||||
|
|
@ -182,11 +252,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
// Print out data for regression
|
||||
public CovariateCounterWalker() throws FileNotFoundException {
|
||||
for ( int i=0; i<NDINUCS; i++) {
|
||||
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
||||
next_dinuc.println("logitQ,pos,indicator");
|
||||
dinuc_outs.add(next_dinuc);
|
||||
}
|
||||
ByQual = new PrintStream("by_quality.csv");
|
||||
if (CREATE_TRAINING_DATA)
|
||||
for ( int i=0; i<NDINUCS; i++) {
|
||||
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
||||
next_dinuc.println("logitQ,pos,indicator");
|
||||
dinuc_outs.add(next_dinuc);
|
||||
}
|
||||
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
|
||||
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
|
||||
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
|
||||
}
|
||||
}
|
||||
|
|
@ -96,25 +96,32 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
|||
byte[] quals = read.getBaseQualities();
|
||||
byte[] recalQuals = new byte[quals.length];
|
||||
|
||||
int numBases = read.getReadLength();
|
||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||
for ( int i = 1; i < bases.length; i++ ) {
|
||||
int cycle = i;
|
||||
recalQuals[numBases-1] = quals[numBases-1]; // can't change last -- no dinuc
|
||||
for ( int i = 1; i < numBases-1; i++ ) { // skip first and last base, qual already set because no dinuc
|
||||
// Take into account that previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||
int cycle = read.getReadNegativeStrandFlag() ? numBases - i - 1 : i;
|
||||
String dinuc = String.format("%c%c", bases[i + (read.getReadNegativeStrandFlag() ? 1 : -1)], bases[i]);
|
||||
byte qual = quals[i];
|
||||
String dinuc = String.format("%c%c", bases[i-1], bases[i]);
|
||||
//System.out.printf("dinuc %c %c%n", bases[i-1], bases[i]);
|
||||
LogisticRegressor regressor = regressors.get(dinuc);
|
||||
double P = -1;
|
||||
byte newQual = qual;
|
||||
byte newQual;
|
||||
|
||||
if ( regressor != null ) { // no N or some other unexpected bp in the stream
|
||||
double logPOver1minusP = regressor.regress((double)cycle+1, (double)qual);
|
||||
double gamma = regressor.regress((double)cycle+1, (double)qual);
|
||||
double expGamma = Math.exp(gamma);
|
||||
double finalP = expGamma / (1+expGamma);
|
||||
newQual = QualityUtils.probToQual(1-finalP);
|
||||
//newQual = -10 * Math.round(logPOver1minusP)
|
||||
/*double POver1minusP = Math.pow(10, logPOver1minusP);
|
||||
P = POver1minusP / (1 + POver1minusP);
|
||||
newQual = QualityUtils.probToQual(P);*/
|
||||
P = POver1minusP / (1 + POver1minusP);*/
|
||||
//newQual = QualityUtils.probToQual(P);
|
||||
|
||||
newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63);
|
||||
//newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63);
|
||||
//System.out.printf("Recal %s %d %d => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, P, newQual);
|
||||
}else{
|
||||
newQual = qual;
|
||||
}
|
||||
|
||||
recalQuals[i] = newQual;
|
||||
|
|
|
|||
Loading…
Reference in New Issue