From b630f2f2f1be27a7f13f9497b807ff8e5a93bf4f Mon Sep 17 00:00:00 2001 From: andrewk Date: Thu, 30 Apr 2009 01:22:50 +0000 Subject: [PATCH] More tables output by CovariateCounterWalker AND made CovariateCounterWalker and LogisticRecalibration aware of positive and negative strandedness of data which changes the regression output significantly. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@568 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/CovariateCounterWalker.java | 109 +++++++++++++++--- .../walkers/LogisticRecalibrationWalker.java | 25 ++-- 2 files changed, 107 insertions(+), 27 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java index a7b3a4b7a..b053ac8b3 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java @@ -21,8 +21,11 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="63") public int MAX_QUAL_SCORE; - @Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files") - public String LOG_REG_TRAINING_FILEROOT; + @Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, defaultValue="output", doc="Filename root for the outputted logistic regression training files") + public String OUTPUT_FILEROOT; + + @Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, defaultValue="false", doc="Create training data files for logistic regression") + public boolean CREATE_TRAINING_DATA; int NDINUCS = 16; RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]; @@ -35,7 +38,9 @@ public class CovariateCounterWalker extends LocusWalker { String dinuc_root = "dinuc"; ArrayList dinuc_outs = new ArrayList(); PrintStream covars_out = new PrintStream("covars.out"); - PrintStream ByQual = new PrintStream("quality_empirical_vs_observed.csv"); + PrintStream ByQualFile; // = new PrintStream("quality_empirical_vs_observed.csv"); + PrintStream ByCycleFile; + PrintStream ByDinucFile; private class RecalData { long N; @@ -93,18 +98,23 @@ public class CovariateCounterWalker extends LocusWalker { List reads = context.getReads(); List offsets = context.getOffsets(); for (int i =0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); int offset = offsets.get(i); - if ( offset > 0 ) { + int numBases = read.getReadLength(); + if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count char base = (char)read.getReadBases()[offset]; - char prevBase = (char)read.getReadBases()[offset-1]; int qual = (int)read.getBaseQualities()[offset]; //out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base)); + // previous base is the next base in terms of machine chemistry if this is a negative strand + char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)]; int dinuc_index = bases2dinucIndex(prevBase,base); if (qual > 0 && qual <= MAX_QUAL_SCORE) { - RecalData datum = data[offset][qual][dinuc_index]; - datum.inc(base,ref); - dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, nuc2num[base]==nuc2num[ref] ? 0 : 1); + // Convert offset into cycle position which means reversing the position of reads on the negative strand + int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset; + data[cycle][qual][dinuc_index].inc(base,ref); + if (CREATE_TRAINING_DATA) + dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, cycle, nuc2num[base]==nuc2num[ref] ? 0 : 1); } } } @@ -120,17 +130,77 @@ public class CovariateCounterWalker extends LocusWalker { } qualityEmpiricalObserved(); + qualityDiffVsCycle(); + qualityDiffVsDinucleotide(); // Close dinuc filehandles - for ( PrintStream dinuc_stream : this.dinuc_outs) - dinuc_stream.close(); + if (CREATE_TRAINING_DATA) + for ( PrintStream dinuc_stream : this.dinuc_outs) + dinuc_stream.close(); } + class MeanReportedQuality { + double Qn = 0; + long n = 0; + + void inc(double Q, long n) { + this.n += n; + Qn += Q * n; + } + + double result() { + return Qn / n; + } + } + + public void qualityDiffVsCycle() { + ArrayList ByCycle = new ArrayList(); + ArrayList ByCycleReportedQ = new ArrayList(); + ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n"); + for (int c=0; c < MAX_READ_LENGTH+1; c++) { + ByCycle.add(new RecalData(c, -1, "-")); + ByCycleReportedQ.add(new MeanReportedQuality()); + } + + for ( RecalData datum: flattenData ) { + ByCycle.get(datum.pos).inc(datum.N, datum.B); + ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N); + } + + for (int c=0; c < MAX_READ_LENGTH+1; c++) { + double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); + double reportedQual = ByCycleReportedQ.get(c).result(); + ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); + } + } + + public void qualityDiffVsDinucleotide() { + ArrayList ByCycle = new ArrayList(); + ArrayList ByCycleReportedQ = new ArrayList(); + ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); + for (int c=0; c < NDINUCS; c++) { + ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c))); + ByCycleReportedQ.add(new MeanReportedQuality()); + } + + for ( RecalData datum: flattenData ) { + int dinucIndex = bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1)); + ByCycle.get(dinucIndex).inc(datum.N, datum.B); + ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); + } + + for (int c=0; c < NDINUCS; c++) { + double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); + double reportedQual = ByCycleReportedQ.get(c).result(); + ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); + } + } + public void qualityEmpiricalObserved() { ArrayList ByQ = new ArrayList(); - ByQual.printf("Qcal, Qobs, B, N%n"); + ByQualFile.printf("Qrep,Qemp,B,N%n"); for (int q=0; q { for (int q=0; q { // Print out data for regression public CovariateCounterWalker() throws FileNotFoundException { - for ( int i=0; i %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, P, newQual); + }else{ + newQual = qual; } recalQuals[i] = newQual;