From 0219d33e10f7d7dbe7452f845b1a5ce23064bb62 Mon Sep 17 00:00:00 2001 From: andrewk Date: Thu, 21 May 2009 18:30:45 +0000 Subject: [PATCH] QualityUtils: added reverse function to reverse an array of bytes (and not complement it), BaseUtils: split qualToProb into itself and qualToErrProb, CovariateCounterWalker and LogisticRecalibrationWalker: several changes including a properly acocunting (only partly complete) for reversing AND complementing bases that are negative strand, PrintReadsWalker: created option to output reads to a BAM file rather than just to the sceern (useful for creating a downsampled BAM file) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@770 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/walkers/PrintReadsWalker.java | 48 +++++-- .../gatk/walkers/CovariateCounterWalker.java | 119 ++++++++++++++---- .../walkers/LogisticRecalibrationWalker.java | 21 ++-- .../broadinstitute/sting/utils/BaseUtils.java | 15 +++ .../sting/utils/QualityUtils.java | 13 +- 5 files changed, 173 insertions(+), 43 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 698a310b8..dae6c404b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -1,17 +1,49 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.LocusContext; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMFileWriterFactory; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.utils.cmdLine.Argument; -public class PrintReadsWalker extends ReadWalker { - public Integer map(char[] ref, SAMRecord read) { - out.println(read.format()); - return 1; +import java.io.PrintStream; +import java.io.FileNotFoundException; +import java.io.File; +import java.util.Random; + +public class PrintReadsWalker extends ReadWalker { + + @Argument(fullName="outputBamFile", shortName="of", doc="Write output to this BAM filename instead of STDOUT", required=false) + String outputBamFile = null; + + public SAMRecord map(char[] ref, SAMRecord read) { + return read; } - public Integer reduceInit() { return 0; } + public SAMFileWriter reduceInit() { + if ( outputBamFile != null ) { // ! outputBamFile.equals("") ) { + SAMFileWriterFactory fact = new SAMFileWriterFactory(); + SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader(); + return fact.makeBAMWriter(header, true, new File(outputBamFile)); + } + else { + return null; + } + } - public Integer reduce(Integer value, Integer sum) { - return value + sum; + public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) { + if ( output != null ) { + output.addAlignment(read); + } else { + out.println(read.format()); + } + + return output; + } + + public void onTraversalDone(SAMFileWriter output) { + if ( output != null ) { + output.close(); + } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java index 16029b89c..27018f04e 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.QualityUtils; import java.util.ArrayList; import java.util.List; @@ -19,14 +20,14 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="MAX_READ_LENGTH", shortName="mrl", doc="max read length", required=false) public int MAX_READ_LENGTH = 101; - @Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", doc="max quality score", required=false) + @Argument(fullName="MAX_QUAL_SCORE", shortName="mqs" , doc="max quality score", required=false) public int MAX_QUAL_SCORE = 63; @Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files") public String OUTPUT_FILEROOT = "output"; @Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression") - public boolean CREATE_TRAINING_DATA = false; + public boolean CREATE_TRAINING_DATA = true; @Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample") public float DOWNSAMPLE_FRACTION=1.0f; @@ -40,11 +41,14 @@ public class CovariateCounterWalker extends LocusWalker { int NDINUCS = 16; RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]; //RecalData[][][] data = new RecalData; - ArrayList flattenData = new ArrayList(); + ArrayList flattenData = new ArrayList(); static int nuc2num[]; static char num2nuc[]; + long counted_sites = 0; // number of sites used to count covariates + long skipped_sites = 0; // number of sites skipped because of a dbSNP entry + String dinuc_root = "dinuc"; ArrayList dinuc_outs = new ArrayList(); PrintStream covars_out = new PrintStream("covars.out"); @@ -100,6 +104,16 @@ public class CovariateCounterWalker extends LocusWalker { } } } + + try { + ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv"); + ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv"); + ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv"); + } catch (FileNotFoundException e){ + System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); + System.exit(1); + } + } public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { @@ -110,7 +124,6 @@ public class CovariateCounterWalker extends LocusWalker { for (int i =0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); - if ((READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) && (read.getMappingQuality() >= MIN_MAPPING_QUALITY) && (DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) { @@ -118,13 +131,15 @@ public class CovariateCounterWalker extends LocusWalker { int offset = offsets.get(i); int numBases = read.getReadLength(); if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count - char base = (char)read.getReadBases()[offset]; int qual = (int)read.getBaseQualities()[offset]; - //out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base)); - // previous base is the next base in terms of machine chemistry if this is a negative strand - char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)]; - int dinuc_index = bases2dinucIndex(prevBase,base); if (qual > 0 && qual <= MAX_QUAL_SCORE) { + // previous base is the next base in terms of machine chemistry if this is a negative strand + char base = (char)read.getReadBases()[offset]; + char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)]; + int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag()); + //char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)]; + //int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag()); + // Convert offset into cycle position which means reversing the position of reads on the negative strand int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset; data[cycle][qual][dinuc_index].inc(base,ref); @@ -132,6 +147,9 @@ public class CovariateCounterWalker extends LocusWalker { } } } + counted_sites += 1; + }else{ + skipped_sites += 1; } return 1; } @@ -147,23 +165,32 @@ public class CovariateCounterWalker extends LocusWalker { qualityDiffVsCycle(); qualityDiffVsDinucleotide(); + out.printf("Counted sites: %d%n", counted_sites); + out.printf("Skipped sites: %d%n", skipped_sites); + out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites); + // Close dinuc filehandles if (CREATE_TRAINING_DATA) writeTrainingData(); + /*if (CREATE_TRAINING_DATA) + for ( PrintStream dinuc_stream : this.dinuc_outs) + dinuc_stream.close();*/ } void writeTrainingData() { - for ( int i=0; i 0) - dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.N - datum.B); - if (datum.B > 0) - dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.B); + if (string2dinucIndex(datum.dinuc) == dinuc_index) { + if ((datum.N - datum.B) > 0) + dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.N - datum.B); + if (datum.B > 0) + dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 1, datum.B); + } } } catch (FileNotFoundException e) { System.err.println("FileNotFoundException: " + e.getMessage()); @@ -177,14 +204,18 @@ public class CovariateCounterWalker extends LocusWalker { class MeanReportedQuality { double Qn = 0; long n = 0; + double sumErrors = 0; // Count of estimated number of errors void inc(double Q, long n) { this.n += n; - Qn += Q * n; + //Qn += Q * n; // wrong calculation that worked did not account for Qs being in log space + sumErrors += QualityUtils.qualToErrorProb((byte)Q) * n; } double result() { - return Qn / n; + //return Qn / n; + return -10 * Math.log10(sumErrors / n); + //return QualityUtils.probToQual(1.0 - (sumErrors / n)); } } @@ -192,6 +223,8 @@ public class CovariateCounterWalker extends LocusWalker { ArrayList ByCycle = new ArrayList(); ArrayList ByCycleReportedQ = new ArrayList(); ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n"); + RecalData All = new RecalData(0,0,""); + MeanReportedQuality AllReported = new MeanReportedQuality(); for (int c=0; c < MAX_READ_LENGTH+1; c++) { ByCycle.add(new RecalData(c, -1, "-")); ByCycleReportedQ.add(new MeanReportedQuality()); @@ -200,6 +233,8 @@ public class CovariateCounterWalker extends LocusWalker { for ( RecalData datum: flattenData ) { ByCycle.get(datum.pos).inc(datum.N, datum.B); ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N); + All.inc(datum.N, datum.B); + AllReported.inc(datum.qual, datum.N); } for (int c=0; c < MAX_READ_LENGTH+1; c++) { @@ -207,21 +242,27 @@ public class CovariateCounterWalker extends LocusWalker { double reportedQual = ByCycleReportedQ.get(c).result(); ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); } + System.out.printf("Cycle: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N)); + System.out.printf("Qrep=%.1f%n", AllReported.result()); } public void qualityDiffVsDinucleotide() { ArrayList ByCycle = new ArrayList(); ArrayList ByCycleReportedQ = new ArrayList(); ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); + RecalData All = new RecalData(0,0,""); + MeanReportedQuality AllReported = new MeanReportedQuality(); for (int c=0; c < NDINUCS; c++) { ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c))); ByCycleReportedQ.add(new MeanReportedQuality()); } for ( RecalData datum: flattenData ) { - int dinucIndex = bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1)); + int dinucIndex = string2dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); ByCycle.get(dinucIndex).inc(datum.N, datum.B); ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); + All.inc(datum.N, datum.B); + AllReported.inc(datum.qual, datum.N); } for (int c=0; c < NDINUCS; c++) { @@ -229,24 +270,37 @@ public class CovariateCounterWalker extends LocusWalker { double reportedQual = ByCycleReportedQ.get(c).result(); ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); } + System.out.printf("Dinuc: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N)); + System.out.printf("Qrep=%.1f%n", AllReported.result()); } public void qualityEmpiricalObserved() { ArrayList ByQ = new ArrayList(); - ByQualFile.printf("Qrep,Qemp,B,N%n"); - for (int q=0; q ByQReportedQ = new ArrayList(); + ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n"); + RecalData All = new RecalData(0,0,""); + MeanReportedQuality AllReported = new MeanReportedQuality(); + for (int q=0; q { return 0; } - public int bases2dinucIndex(char prevBase, char base) { - return nuc2num[prevBase] * 4 + nuc2num[base]; + public int bases2dinucIndex(char prevBase, char base, boolean Complement) { + if (!Complement) { + return nuc2num[prevBase] * 4 + nuc2num[base]; + }else{ + return (3 - nuc2num[prevBase]) * 4 + (3 - nuc2num[base]); + } } public String dinucIndex2bases(int index) { @@ -266,6 +324,10 @@ public class CovariateCounterWalker extends LocusWalker { return new String( data ); } + public int string2dinucIndex(String s) { + return bases2dinucIndex(s.charAt(0), s.charAt(1), false); + } + static { nuc2num = new int[128]; nuc2num['A'] = 0; @@ -286,9 +348,12 @@ public class CovariateCounterWalker extends LocusWalker { Random random_genrator; // Print out data for regression public CovariateCounterWalker() throws FileNotFoundException { - ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv"); - ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv"); - ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv"); + /*if (CREATE_TRAINING_DATA) + for ( int i=0; i