From 58b2578c44418b46f103683bc5659424c21104cd Mon Sep 17 00:00:00 2001 From: andrewk Date: Tue, 28 Apr 2009 00:37:48 +0000 Subject: [PATCH] Several changes to CovariateCounter walker to print more tables (called vs. observed Q scores), bug fixes to LogisticRecalibrationWalker and LogisticRegressor, and print string functionality added to Pair. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@550 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/CovariateCounterWalker.java | 43 ++++++++++++++++--- .../walkers/LogisticRecalibrationWalker.java | 14 +++--- .../sting/utils/LogisticRegressor.java | 13 +++--- .../org/broadinstitute/sting/utils/Pair.java | 4 ++ python/LogRegression.py | 5 ++- 5 files changed, 58 insertions(+), 21 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java index 695eecbe7..a7b3a4b7a 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java @@ -18,7 +18,7 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="MAX_READ_LENGTH", shortName="mrl", required=false,defaultValue="101") public int MAX_READ_LENGTH; - @Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="34") + @Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="63") public int MAX_QUAL_SCORE; @Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files") @@ -35,6 +35,7 @@ public class CovariateCounterWalker extends LocusWalker { String dinuc_root = "dinuc"; ArrayList dinuc_outs = new ArrayList(); PrintStream covars_out = new PrintStream("covars.out"); + PrintStream ByQual = new PrintStream("quality_empirical_vs_observed.csv"); private class RecalData { long N; @@ -49,9 +50,14 @@ public class CovariateCounterWalker extends LocusWalker { this.dinuc = dinuc; } + public void inc(long incN, long incB) { + N += incN; + B += incB; + } + + public void inc(char curBase, char ref) { - N++; - B += nuc2num[curBase] == nuc2num[ref] ? 0 : 1; + inc(1, nuc2num[curBase] == nuc2num[ref] ? 0 : 1); //out.printf("%s %s\n", curBase, ref); } @@ -83,7 +89,7 @@ public class CovariateCounterWalker extends LocusWalker { public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null); - if ( dbsnp == null ) { + if ( dbsnp == null || !dbsnp.isSNP() ) { List reads = context.getReads(); List offsets = context.getOffsets(); for (int i =0; i < reads.size(); i++ ) { @@ -95,9 +101,11 @@ public class CovariateCounterWalker extends LocusWalker { int qual = (int)read.getBaseQualities()[offset]; //out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base)); int dinuc_index = bases2dinucIndex(prevBase,base); - RecalData datum = data[offset][qual][dinuc_index]; - datum.inc(base,ref); - dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, base==ref ? 0 : 1); + if (qual > 0 && qual <= MAX_QUAL_SCORE) { + RecalData datum = data[offset][qual][dinuc_index]; + datum.inc(base,ref); + dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, nuc2num[base]==nuc2num[ref] ? 0 : 1); + } } } } @@ -111,12 +119,32 @@ public class CovariateCounterWalker extends LocusWalker { covars_out.println(datum); } + qualityEmpiricalObserved(); + // Close dinuc filehandles for ( PrintStream dinuc_stream : this.dinuc_outs) dinuc_stream.close(); } + public void qualityEmpiricalObserved() { + + ArrayList ByQ = new ArrayList(); + ByQual.printf("Qcal, Qobs, B, N%n"); + for (int q=0; q { next_dinuc.println("logitQ,pos,indicator"); dinuc_outs.add(next_dinuc); } + ByQual = new PrintStream("by_quality.csv"); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java index a6a15e507..2f9bdca6e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java @@ -108,19 +108,21 @@ public class LogisticRecalibrationWalker extends ReadWalker %f => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, POver1minusP, P, newQual); + newQual = QualityUtils.probToQual(P);*/ + + newQual = (byte)Math.min(Math.round(-10*logPOver1minusP),63); + //System.out.printf("Recal %s %d %d => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, P, newQual); } recalQuals[i] = newQual; } - System.out.printf("OLD: %s%n", read.format()); + //System.out.printf("OLD: %s%n", read.format()); read.setBaseQualities(recalQuals); - System.out.printf("NEW: %s%n", read.format()); + //System.out.printf("NEW: %s%n", read.format()); return recalRead; } diff --git a/java/src/org/broadinstitute/sting/utils/LogisticRegressor.java b/java/src/org/broadinstitute/sting/utils/LogisticRegressor.java index cdf6ab013..3c9efc21c 100755 --- a/java/src/org/broadinstitute/sting/utils/LogisticRegressor.java +++ b/java/src/org/broadinstitute/sting/utils/LogisticRegressor.java @@ -27,8 +27,8 @@ public class LogisticRegressor { // setup coefficient matrix coefficients = new double[order+1][order+1]; - for ( int i = 0; i < order; i++ ) { - for ( int j = 0; j < order; j++ ) { + for ( int i = 0; i <= order; i++ ) { + for ( int j = 0; j <= order; j++ ) { coefficients[i][j] = 0.0; } } @@ -44,10 +44,11 @@ public class LogisticRegressor { public double regress(double f1, double f2) { double v = 0.0; - for ( int i = 0; i < order; i++ ) { - for ( int j = 0; j < order; j++ ) { + for ( int i = 0; i <= order; i++ ) { + for ( int j = 0; j <= order; j++ ) { double c = coefficients[i][j]; v += c * Math.pow(f1,i) * Math.pow(f2, j); + //System.out.printf("i=%d, j=%d, v=%f, c=%f, f1=%f, f2=%f, f1^i=%f, f2^j=%f%n", i, j, v, c, f1, f2, Math.pow(f1,i), Math.pow(f2,j)); } } return v; @@ -56,8 +57,8 @@ public class LogisticRegressor { public String toString() { StringBuilder s = new StringBuilder(); s.append(String.format("nFeatures=%d, order=%d: ", nFeatures, order)); - for ( int i = 0; i < order; i++ ) { - for ( int j = 0; j < order; j++ ) { + for ( int i = 0; i <= order; i++ ) { + for ( int j = 0; j <= order; j++ ) { s.append(" " + coefficients[i][j]); } } diff --git a/java/src/org/broadinstitute/sting/utils/Pair.java b/java/src/org/broadinstitute/sting/utils/Pair.java index e7cbf2e3f..cd0b09671 100644 --- a/java/src/org/broadinstitute/sting/utils/Pair.java +++ b/java/src/org/broadinstitute/sting/utils/Pair.java @@ -19,4 +19,8 @@ public class Pair { the member field. */ public Y getSecond() { return second; } + + public String toString() { + return first+","+second; + } } diff --git a/python/LogRegression.py b/python/LogRegression.py index b2c50e668..745c44c9f 100755 --- a/python/LogRegression.py +++ b/python/LogRegression.py @@ -1,3 +1,4 @@ + #!/usr/bin/env python import sys @@ -5,7 +6,7 @@ import sys dinuc_root = sys.argv[1] fout = file(dinuc_root+".log_reg_params", "w") -fout.write("p,q\t") +fout.write("dinuc\t") for p in range(5): for q in range(5): fout.write("%d,%d\t" % (p,q)) @@ -13,7 +14,7 @@ fout.write("\n") for dinuc in ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT"): dinin = open(dinuc_root+"."+dinuc+".parameters") - dinin.readline() + #dinin.readline() params = [] for line in dinin: line.rstrip("\n")