More tables output by CovariateCounterWalker AND made CovariateCounterWalker and LogisticRecalibration aware of positive and negative strandedness of data which changes the regression output significantly.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@568 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f7a877bfeb
commit
b630f2f2f1
|
|
@ -21,8 +21,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="63")
|
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="63")
|
||||||
public int MAX_QUAL_SCORE;
|
public int MAX_QUAL_SCORE;
|
||||||
|
|
||||||
@Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files")
|
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, defaultValue="output", doc="Filename root for the outputted logistic regression training files")
|
||||||
public String LOG_REG_TRAINING_FILEROOT;
|
public String OUTPUT_FILEROOT;
|
||||||
|
|
||||||
|
@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, defaultValue="false", doc="Create training data files for logistic regression")
|
||||||
|
public boolean CREATE_TRAINING_DATA;
|
||||||
|
|
||||||
int NDINUCS = 16;
|
int NDINUCS = 16;
|
||||||
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
||||||
|
|
@ -35,7 +38,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
String dinuc_root = "dinuc";
|
String dinuc_root = "dinuc";
|
||||||
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
||||||
PrintStream covars_out = new PrintStream("covars.out");
|
PrintStream covars_out = new PrintStream("covars.out");
|
||||||
PrintStream ByQual = new PrintStream("quality_empirical_vs_observed.csv");
|
PrintStream ByQualFile; // = new PrintStream("quality_empirical_vs_observed.csv");
|
||||||
|
PrintStream ByCycleFile;
|
||||||
|
PrintStream ByDinucFile;
|
||||||
|
|
||||||
private class RecalData {
|
private class RecalData {
|
||||||
long N;
|
long N;
|
||||||
|
|
@ -93,18 +98,23 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
List<SAMRecord> reads = context.getReads();
|
List<SAMRecord> reads = context.getReads();
|
||||||
List<Integer> offsets = context.getOffsets();
|
List<Integer> offsets = context.getOffsets();
|
||||||
for (int i =0; i < reads.size(); i++ ) {
|
for (int i =0; i < reads.size(); i++ ) {
|
||||||
|
|
||||||
SAMRecord read = reads.get(i);
|
SAMRecord read = reads.get(i);
|
||||||
int offset = offsets.get(i);
|
int offset = offsets.get(i);
|
||||||
if ( offset > 0 ) {
|
int numBases = read.getReadLength();
|
||||||
|
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||||
char base = (char)read.getReadBases()[offset];
|
char base = (char)read.getReadBases()[offset];
|
||||||
char prevBase = (char)read.getReadBases()[offset-1];
|
|
||||||
int qual = (int)read.getBaseQualities()[offset];
|
int qual = (int)read.getBaseQualities()[offset];
|
||||||
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
||||||
|
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||||
|
char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
||||||
int dinuc_index = bases2dinucIndex(prevBase,base);
|
int dinuc_index = bases2dinucIndex(prevBase,base);
|
||||||
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
||||||
RecalData datum = data[offset][qual][dinuc_index];
|
// Convert offset into cycle position which means reversing the position of reads on the negative strand
|
||||||
datum.inc(base,ref);
|
int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
|
||||||
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, nuc2num[base]==nuc2num[ref] ? 0 : 1);
|
data[cycle][qual][dinuc_index].inc(base,ref);
|
||||||
|
if (CREATE_TRAINING_DATA)
|
||||||
|
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, cycle, nuc2num[base]==nuc2num[ref] ? 0 : 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -120,17 +130,77 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
qualityEmpiricalObserved();
|
qualityEmpiricalObserved();
|
||||||
|
qualityDiffVsCycle();
|
||||||
|
qualityDiffVsDinucleotide();
|
||||||
|
|
||||||
// Close dinuc filehandles
|
// Close dinuc filehandles
|
||||||
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
if (CREATE_TRAINING_DATA)
|
||||||
dinuc_stream.close();
|
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
||||||
|
dinuc_stream.close();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class MeanReportedQuality {
|
||||||
|
double Qn = 0;
|
||||||
|
long n = 0;
|
||||||
|
|
||||||
|
void inc(double Q, long n) {
|
||||||
|
this.n += n;
|
||||||
|
Qn += Q * n;
|
||||||
|
}
|
||||||
|
|
||||||
|
double result() {
|
||||||
|
return Qn / n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void qualityDiffVsCycle() {
|
||||||
|
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||||
|
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||||
|
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||||
|
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
||||||
|
ByCycle.add(new RecalData(c, -1, "-"));
|
||||||
|
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( RecalData datum: flattenData ) {
|
||||||
|
ByCycle.get(datum.pos).inc(datum.N, datum.B);
|
||||||
|
ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
||||||
|
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||||
|
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||||
|
ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void qualityDiffVsDinucleotide() {
|
||||||
|
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||||
|
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||||
|
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||||
|
for (int c=0; c < NDINUCS; c++) {
|
||||||
|
ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c)));
|
||||||
|
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( RecalData datum: flattenData ) {
|
||||||
|
int dinucIndex = bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1));
|
||||||
|
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
|
||||||
|
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int c=0; c < NDINUCS; c++) {
|
||||||
|
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||||
|
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||||
|
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void qualityEmpiricalObserved() {
|
public void qualityEmpiricalObserved() {
|
||||||
|
|
||||||
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
||||||
ByQual.printf("Qcal, Qobs, B, N%n");
|
ByQualFile.printf("Qrep,Qemp,B,N%n");
|
||||||
for (int q=0; q<MAX_QUAL_SCORE+1; q++)
|
for (int q=0; q<MAX_QUAL_SCORE+1; q++)
|
||||||
ByQ.add(new RecalData(-1,q,"-"));
|
ByQ.add(new RecalData(-1,q,"-"));
|
||||||
|
|
||||||
|
|
@ -140,7 +210,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
for (int q=0; q<MAX_QUAL_SCORE; q++) {
|
for (int q=0; q<MAX_QUAL_SCORE; q++) {
|
||||||
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
||||||
ByQual.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
|
ByQualFile.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
|
||||||
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -182,11 +252,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
// Print out data for regression
|
// Print out data for regression
|
||||||
public CovariateCounterWalker() throws FileNotFoundException {
|
public CovariateCounterWalker() throws FileNotFoundException {
|
||||||
for ( int i=0; i<NDINUCS; i++) {
|
if (CREATE_TRAINING_DATA)
|
||||||
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
for ( int i=0; i<NDINUCS; i++) {
|
||||||
next_dinuc.println("logitQ,pos,indicator");
|
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
||||||
dinuc_outs.add(next_dinuc);
|
next_dinuc.println("logitQ,pos,indicator");
|
||||||
}
|
dinuc_outs.add(next_dinuc);
|
||||||
ByQual = new PrintStream("by_quality.csv");
|
}
|
||||||
|
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
|
||||||
|
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
|
||||||
|
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -96,25 +96,32 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
||||||
byte[] quals = read.getBaseQualities();
|
byte[] quals = read.getBaseQualities();
|
||||||
byte[] recalQuals = new byte[quals.length];
|
byte[] recalQuals = new byte[quals.length];
|
||||||
|
|
||||||
|
int numBases = read.getReadLength();
|
||||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||||
for ( int i = 1; i < bases.length; i++ ) {
|
recalQuals[numBases-1] = quals[numBases-1]; // can't change last -- no dinuc
|
||||||
int cycle = i;
|
for ( int i = 1; i < numBases-1; i++ ) { // skip first and last base, qual already set because no dinuc
|
||||||
|
// Take into account that previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||||
|
int cycle = read.getReadNegativeStrandFlag() ? numBases - i - 1 : i;
|
||||||
|
String dinuc = String.format("%c%c", bases[i + (read.getReadNegativeStrandFlag() ? 1 : -1)], bases[i]);
|
||||||
byte qual = quals[i];
|
byte qual = quals[i];
|
||||||
String dinuc = String.format("%c%c", bases[i-1], bases[i]);
|
|
||||||
//System.out.printf("dinuc %c %c%n", bases[i-1], bases[i]);
|
//System.out.printf("dinuc %c %c%n", bases[i-1], bases[i]);
|
||||||
LogisticRegressor regressor = regressors.get(dinuc);
|
LogisticRegressor regressor = regressors.get(dinuc);
|
||||||
double P = -1;
|
byte newQual;
|
||||||
byte newQual = qual;
|
|
||||||
|
|
||||||
if ( regressor != null ) { // no N or some other unexpected bp in the stream
|
if ( regressor != null ) { // no N or some other unexpected bp in the stream
|
||||||
double logPOver1minusP = regressor.regress((double)cycle+1, (double)qual);
|
double gamma = regressor.regress((double)cycle+1, (double)qual);
|
||||||
|
double expGamma = Math.exp(gamma);
|
||||||
|
double finalP = expGamma / (1+expGamma);
|
||||||
|
newQual = QualityUtils.probToQual(1-finalP);
|
||||||
//newQual = -10 * Math.round(logPOver1minusP)
|
//newQual = -10 * Math.round(logPOver1minusP)
|
||||||
/*double POver1minusP = Math.pow(10, logPOver1minusP);
|
/*double POver1minusP = Math.pow(10, logPOver1minusP);
|
||||||
P = POver1minusP / (1 + POver1minusP);
|
P = POver1minusP / (1 + POver1minusP);*/
|
||||||
newQual = QualityUtils.probToQual(P);*/
|
//newQual = QualityUtils.probToQual(P);
|
||||||
|
|
||||||
newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63);
|
//newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63);
|
||||||
//System.out.printf("Recal %s %d %d => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, P, newQual);
|
//System.out.printf("Recal %s %d %d => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, P, newQual);
|
||||||
|
}else{
|
||||||
|
newQual = qual;
|
||||||
}
|
}
|
||||||
|
|
||||||
recalQuals[i] = newQual;
|
recalQuals[i] = newQual;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue