Several changes to CovariateCounter walker to print more tables (called vs. observed Q scores), bug fixes to LogisticRecalibrationWalker and LogisticRegressor, and print string functionality added to Pair.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@550 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a0a581171b
commit
58b2578c44
|
|
@ -18,7 +18,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
@Argument(fullName="MAX_READ_LENGTH", shortName="mrl", required=false,defaultValue="101")
|
||||
public int MAX_READ_LENGTH;
|
||||
|
||||
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="34")
|
||||
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="63")
|
||||
public int MAX_QUAL_SCORE;
|
||||
|
||||
@Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files")
|
||||
|
|
@ -35,6 +35,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
String dinuc_root = "dinuc";
|
||||
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
||||
PrintStream covars_out = new PrintStream("covars.out");
|
||||
PrintStream ByQual = new PrintStream("quality_empirical_vs_observed.csv");
|
||||
|
||||
private class RecalData {
|
||||
long N;
|
||||
|
|
@ -49,9 +50,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
this.dinuc = dinuc;
|
||||
}
|
||||
|
||||
public void inc(long incN, long incB) {
|
||||
N += incN;
|
||||
B += incB;
|
||||
}
|
||||
|
||||
|
||||
public void inc(char curBase, char ref) {
|
||||
N++;
|
||||
B += nuc2num[curBase] == nuc2num[ref] ? 0 : 1;
|
||||
inc(1, nuc2num[curBase] == nuc2num[ref] ? 0 : 1);
|
||||
//out.printf("%s %s\n", curBase, ref);
|
||||
}
|
||||
|
||||
|
|
@ -83,7 +89,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
|
||||
if ( dbsnp == null ) {
|
||||
if ( dbsnp == null || !dbsnp.isSNP() ) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < reads.size(); i++ ) {
|
||||
|
|
@ -95,9 +101,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
int qual = (int)read.getBaseQualities()[offset];
|
||||
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
||||
int dinuc_index = bases2dinucIndex(prevBase,base);
|
||||
RecalData datum = data[offset][qual][dinuc_index];
|
||||
datum.inc(base,ref);
|
||||
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, base==ref ? 0 : 1);
|
||||
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
||||
RecalData datum = data[offset][qual][dinuc_index];
|
||||
datum.inc(base,ref);
|
||||
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, nuc2num[base]==nuc2num[ref] ? 0 : 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -111,12 +119,32 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
covars_out.println(datum);
|
||||
}
|
||||
|
||||
qualityEmpiricalObserved();
|
||||
|
||||
// Close dinuc filehandles
|
||||
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
||||
dinuc_stream.close();
|
||||
|
||||
}
|
||||
|
||||
public void qualityEmpiricalObserved() {
|
||||
|
||||
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
||||
ByQual.printf("Qcal, Qobs, B, N%n");
|
||||
for (int q=0; q<MAX_QUAL_SCORE+1; q++)
|
||||
ByQ.add(new RecalData(-1,q,"-"));
|
||||
|
||||
for ( RecalData datum: flattenData ){
|
||||
ByQ.get(datum.qual).inc(datum.N, datum.B);
|
||||
}
|
||||
|
||||
for (int q=0; q<MAX_QUAL_SCORE; q++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
||||
ByQual.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
|
||||
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -159,5 +187,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
next_dinuc.println("logitQ,pos,indicator");
|
||||
dinuc_outs.add(next_dinuc);
|
||||
}
|
||||
ByQual = new PrintStream("by_quality.csv");
|
||||
}
|
||||
}
|
||||
|
|
@ -108,19 +108,21 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
|||
|
||||
if ( regressor != null ) { // no N or some other unexpected bp in the stream
|
||||
double logPOver1minusP = regressor.regress((double)cycle+1, (double)qual);
|
||||
double POver1minusP = Math.pow(10, logPOver1minusP);
|
||||
//newQual = -10 * Math.round(logPOver1minusP)
|
||||
/*double POver1minusP = Math.pow(10, logPOver1minusP);
|
||||
P = POver1minusP / (1 + POver1minusP);
|
||||
newQual = QualityUtils.probToQual(P);
|
||||
//newQual = (byte)Math.min(Math.round(newQualDouble),63);
|
||||
System.out.printf("Recal %s %d %d => %f => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, POver1minusP, P, newQual);
|
||||
newQual = QualityUtils.probToQual(P);*/
|
||||
|
||||
newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63);
|
||||
//System.out.printf("Recal %s %d %d => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, P, newQual);
|
||||
}
|
||||
|
||||
recalQuals[i] = newQual;
|
||||
}
|
||||
|
||||
System.out.printf("OLD: %s%n", read.format());
|
||||
//System.out.printf("OLD: %s%n", read.format());
|
||||
read.setBaseQualities(recalQuals);
|
||||
System.out.printf("NEW: %s%n", read.format());
|
||||
//System.out.printf("NEW: %s%n", read.format());
|
||||
return recalRead;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,8 +27,8 @@ public class LogisticRegressor {
|
|||
|
||||
// setup coefficient matrix
|
||||
coefficients = new double[order+1][order+1];
|
||||
for ( int i = 0; i < order; i++ ) {
|
||||
for ( int j = 0; j < order; j++ ) {
|
||||
for ( int i = 0; i <= order; i++ ) {
|
||||
for ( int j = 0; j <= order; j++ ) {
|
||||
coefficients[i][j] = 0.0;
|
||||
}
|
||||
}
|
||||
|
|
@ -44,10 +44,11 @@ public class LogisticRegressor {
|
|||
|
||||
public double regress(double f1, double f2) {
|
||||
double v = 0.0;
|
||||
for ( int i = 0; i < order; i++ ) {
|
||||
for ( int j = 0; j < order; j++ ) {
|
||||
for ( int i = 0; i <= order; i++ ) {
|
||||
for ( int j = 0; j <= order; j++ ) {
|
||||
double c = coefficients[i][j];
|
||||
v += c * Math.pow(f1,i) * Math.pow(f2, j);
|
||||
//System.out.printf("i=%d, j=%d, v=%f, c=%f, f1=%f, f2=%f, f1^i=%f, f2^j=%f%n", i, j, v, c, f1, f2, Math.pow(f1,i), Math.pow(f2,j));
|
||||
}
|
||||
}
|
||||
return v;
|
||||
|
|
@ -56,8 +57,8 @@ public class LogisticRegressor {
|
|||
public String toString() {
|
||||
StringBuilder s = new StringBuilder();
|
||||
s.append(String.format("nFeatures=%d, order=%d: ", nFeatures, order));
|
||||
for ( int i = 0; i < order; i++ ) {
|
||||
for ( int j = 0; j < order; j++ ) {
|
||||
for ( int i = 0; i <= order; i++ ) {
|
||||
for ( int j = 0; j <= order; j++ ) {
|
||||
s.append(" " + coefficients[i][j]);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,4 +19,8 @@ public class Pair<X,Y> {
|
|||
the member field.
|
||||
*/
|
||||
public Y getSecond() { return second; }
|
||||
|
||||
public String toString() {
|
||||
return first+","+second;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
|
|
@ -5,7 +6,7 @@ import sys
|
|||
dinuc_root = sys.argv[1]
|
||||
fout = file(dinuc_root+".log_reg_params", "w")
|
||||
|
||||
fout.write("p,q\t")
|
||||
fout.write("dinuc\t")
|
||||
for p in range(5):
|
||||
for q in range(5):
|
||||
fout.write("%d,%d\t" % (p,q))
|
||||
|
|
@ -13,7 +14,7 @@ fout.write("\n")
|
|||
|
||||
for dinuc in ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT"):
|
||||
dinin = open(dinuc_root+"."+dinuc+".parameters")
|
||||
dinin.readline()
|
||||
#dinin.readline()
|
||||
params = []
|
||||
for line in dinin:
|
||||
line.rstrip("\n")
|
||||
|
|
|
|||
Loading…
Reference in New Issue