Several changes to CovariateCounter walker to print more tables (called vs. observed Q scores), bug fixes to LogisticRecalibrationWalker and LogisticRegressor, and print string functionality added to Pair.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@550 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-04-28 00:37:48 +00:00
parent a0a581171b
commit 58b2578c44
5 changed files with 58 additions and 21 deletions

View File

@ -18,7 +18,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="MAX_READ_LENGTH", shortName="mrl", required=false,defaultValue="101")
public int MAX_READ_LENGTH;
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="34")
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="63")
public int MAX_QUAL_SCORE;
@Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files")
@ -35,6 +35,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
String dinuc_root = "dinuc";
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
PrintStream covars_out = new PrintStream("covars.out");
PrintStream ByQual = new PrintStream("quality_empirical_vs_observed.csv");
private class RecalData {
long N;
@ -49,9 +50,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
this.dinuc = dinuc;
}
public void inc(long incN, long incB) {
N += incN;
B += incB;
}
public void inc(char curBase, char ref) {
N++;
B += nuc2num[curBase] == nuc2num[ref] ? 0 : 1;
inc(1, nuc2num[curBase] == nuc2num[ref] ? 0 : 1);
//out.printf("%s %s\n", curBase, ref);
}
@ -83,7 +89,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
if ( dbsnp == null ) {
if ( dbsnp == null || !dbsnp.isSNP() ) {
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < reads.size(); i++ ) {
@ -95,9 +101,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
int qual = (int)read.getBaseQualities()[offset];
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
int dinuc_index = bases2dinucIndex(prevBase,base);
RecalData datum = data[offset][qual][dinuc_index];
datum.inc(base,ref);
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, base==ref ? 0 : 1);
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
RecalData datum = data[offset][qual][dinuc_index];
datum.inc(base,ref);
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, nuc2num[base]==nuc2num[ref] ? 0 : 1);
}
}
}
}
@ -111,12 +119,32 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
covars_out.println(datum);
}
qualityEmpiricalObserved();
// Close dinuc filehandles
for ( PrintStream dinuc_stream : this.dinuc_outs)
dinuc_stream.close();
}
public void qualityEmpiricalObserved() {
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
ByQual.printf("Qcal, Qobs, B, N%n");
for (int q=0; q<MAX_QUAL_SCORE+1; q++)
ByQ.add(new RecalData(-1,q,"-"));
for ( RecalData datum: flattenData ){
ByQ.get(datum.qual).inc(datum.N, datum.B);
}
for (int q=0; q<MAX_QUAL_SCORE; q++) {
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
ByQual.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
}
}
public Integer reduceInit() {
return 0;
}
@ -159,5 +187,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
next_dinuc.println("logitQ,pos,indicator");
dinuc_outs.add(next_dinuc);
}
ByQual = new PrintStream("by_quality.csv");
}
}

View File

@ -108,19 +108,21 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
if ( regressor != null ) { // no N or some other unexpected bp in the stream
double logPOver1minusP = regressor.regress((double)cycle+1, (double)qual);
double POver1minusP = Math.pow(10, logPOver1minusP);
//newQual = -10 * Math.round(logPOver1minusP)
/*double POver1minusP = Math.pow(10, logPOver1minusP);
P = POver1minusP / (1 + POver1minusP);
newQual = QualityUtils.probToQual(P);
//newQual = (byte)Math.min(Math.round(newQualDouble),63);
System.out.printf("Recal %s %d %d => %f => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, POver1minusP, P, newQual);
newQual = QualityUtils.probToQual(P);*/
newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63);
//System.out.printf("Recal %s %d %d => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, P, newQual);
}
recalQuals[i] = newQual;
}
System.out.printf("OLD: %s%n", read.format());
//System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals);
System.out.printf("NEW: %s%n", read.format());
//System.out.printf("NEW: %s%n", read.format());
return recalRead;
}

View File

@ -27,8 +27,8 @@ public class LogisticRegressor {
// setup coefficient matrix
coefficients = new double[order+1][order+1];
for ( int i = 0; i < order; i++ ) {
for ( int j = 0; j < order; j++ ) {
for ( int i = 0; i <= order; i++ ) {
for ( int j = 0; j <= order; j++ ) {
coefficients[i][j] = 0.0;
}
}
@ -44,10 +44,11 @@ public class LogisticRegressor {
public double regress(double f1, double f2) {
double v = 0.0;
for ( int i = 0; i < order; i++ ) {
for ( int j = 0; j < order; j++ ) {
for ( int i = 0; i <= order; i++ ) {
for ( int j = 0; j <= order; j++ ) {
double c = coefficients[i][j];
v += c * Math.pow(f1,i) * Math.pow(f2, j);
//System.out.printf("i=%d, j=%d, v=%f, c=%f, f1=%f, f2=%f, f1^i=%f, f2^j=%f%n", i, j, v, c, f1, f2, Math.pow(f1,i), Math.pow(f2,j));
}
}
return v;
@ -56,8 +57,8 @@ public class LogisticRegressor {
public String toString() {
StringBuilder s = new StringBuilder();
s.append(String.format("nFeatures=%d, order=%d: ", nFeatures, order));
for ( int i = 0; i < order; i++ ) {
for ( int j = 0; j < order; j++ ) {
for ( int i = 0; i <= order; i++ ) {
for ( int j = 0; j <= order; j++ ) {
s.append(" " + coefficients[i][j]);
}
}

View File

@ -19,4 +19,8 @@ public class Pair<X,Y> {
the member field.
*/
public Y getSecond() { return second; }
public String toString() {
return first+","+second;
}
}

View File

@ -1,3 +1,4 @@
#!/usr/bin/env python
import sys
@ -5,7 +6,7 @@ import sys
dinuc_root = sys.argv[1]
fout = file(dinuc_root+".log_reg_params", "w")
fout.write("p,q\t")
fout.write("dinuc\t")
for p in range(5):
for q in range(5):
fout.write("%d,%d\t" % (p,q))
@ -13,7 +14,7 @@ fout.write("\n")
for dinuc in ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT"):
dinin = open(dinuc_root+"."+dinuc+".parameters")
dinin.readline()
#dinin.readline()
params = []
for line in dinin:
line.rstrip("\n")