diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/CovariateCounterWalker.java index 036f71931..c614ad20d 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -55,7 +55,6 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="") public boolean collapseDinuc = false; - int NDINUCS = 16; //ArrayList flattenData = new ArrayList(); //HashMap data = new HashMap(); HashMap data = new HashMap(); @@ -76,7 +75,7 @@ public class CovariateCounterWalker extends LocusWalker { if( !isSupportedReadGroup(readGroup) ) continue; String rg = readGroup.getReadGroupId(); - RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, ! collapsePos, ! collapseDinuc ); + RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc ); data.put(rg, manager); } } @@ -142,7 +141,11 @@ public class CovariateCounterWalker extends LocusWalker { int qual = quals[offset]; if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) { // previous base is the next base in terms of machine chemistry if this is a negative strand - //System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref)); + // if ( qual == 2 ) + //if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") ) + // System.out.printf("Adding neg?=%b b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c name=%s%n", + // read.getReadNegativeStrandFlag(), (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, + // ref, (char)BaseUtils.simpleComplement(ref), read.getReadName()); RecalData datum = getRecalData(rg, cycle, qual, prevBase, base); if (datum != null) datum.inc(base,ref); return 1; @@ -174,7 +177,7 @@ public class CovariateCounterWalker extends LocusWalker { if (CREATE_TRAINING_DATA) writeTrainingData(); } - void printInfo(PrintStream out) { + private void printInfo(PrintStream out) { out.printf("# date %s%n", new Date()); out.printf("# collapsed_pos %b%n", collapsePos); out.printf("# collapsed_dinuc %b%n", collapseDinuc); @@ -185,16 +188,16 @@ public class CovariateCounterWalker extends LocusWalker { } - void writeTrainingData() { + private void writeTrainingData() { PrintStream dinuc_out = null; PrintStream table_out = null; try { dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv"); dinuc_out.println("rg,dn,logitQ,pos,indicator,count"); for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - for ( int dinuc_index=0; dinuc_index 0) dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B); if (datum.B > 0) @@ -207,7 +210,7 @@ public class CovariateCounterWalker extends LocusWalker { if ( outputRawData ) { table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv"); printInfo(table_out); - table_out.println("rg,dn,Qrep,pos,NBases,MMismatches"); + table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp"); for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { if ( datum.N > 0 ) @@ -295,20 +298,20 @@ public class CovariateCounterWalker extends LocusWalker { ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),""); MeanReportedQuality AllReported = new MeanReportedQuality(); - for (int c=0; c < NDINUCS; c++) { + for (int c=0; c < RecalData.NDINUCS; c++) { ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),RecalData.dinucIndex2bases(c))); ByCycleReportedQ.add(new MeanReportedQuality()); } for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { - int dinucIndex = RecalData.string2dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); + int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); ByCycle.get(dinucIndex).inc(datum.N, datum.B); ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); All.inc(datum.N, datum.B); AllReported.inc(datum.qual, datum.N); } - for (int c=0; c < NDINUCS; c++) { + for (int c=0; c < RecalData.NDINUCS; c++) { double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); double reportedQual = ByCycleReportedQ.get(c).result(); ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); @@ -347,7 +350,7 @@ public class CovariateCounterWalker extends LocusWalker { for (int q=0; q { @Argument(shortName="logisticParams", doc="logistic params file", required=true) public String logisticParamsFile; @@ -185,7 +188,7 @@ public class LogisticRecalibrationWalker extends ReadWalker { @Argument(shortName="params", doc="CountCovariates params file", required=true) public String paramsFile; @@ -27,26 +32,52 @@ public class TableRecalibrationWalker extends ReadWalker [prevBase x base -> [cycle, qual, new qual]] HashMap cache = new HashMap(); - @Argument(shortName="serial", doc="", required=false) + //@Argument(shortName="serial", doc="", required=false) boolean serialRecalibration = false; + private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)"); + private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)"); + private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*"); + public void initialize() { try { System.out.printf("Reading data...%n"); List data = new ArrayList(); + boolean collapsedPos = false; + boolean collapsedDinuc = false; + List lines = new xReadLines(new File(paramsFile)).readLines(); for ( String line : lines ) { - // rg,dn,logitQ,pos,indicator,count - // SRR007069,AA,28,1,0,2 - data.add(RecalData.fromCSVString(line)); + //System.out.printf("Reading line %s%n", line); + if ( HEADER_PATTERN.matcher(line).matches() ) + continue; + if ( COMMENT_PATTERN.matcher(line).matches() ) { + collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos); + collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc); + //System.out.printf("Collapsed %b %b%n", collapsedPos, collapsedDinuc); + } + else { + data.add(RecalData.fromCSVString(line)); + } } - initializeCache(data); + initializeCache(data, collapsedPos, collapsedDinuc); } catch ( FileNotFoundException e ) { Utils.scareUser("Cannot read/find parameters file " + paramsFile); } } - private void initializeCache(List data) { + private boolean parseCommentLine(Pattern pat, String line, boolean flag) { + Matcher m = pat.matcher(line); + if ( m.matches() ) { + //System.out.printf("Parsing %s%n", m.group(1)); + flag = Boolean.parseBoolean(m.group(1)); + } + + return flag; + } + + private void initializeCache(List data, boolean collapsedPos, boolean collapsedDinuc ) { Set readGroups = new HashSet(); Set dinucs = new HashSet(); int maxPos = -1; @@ -68,7 +99,7 @@ public class TableRecalibrationWalker extends ReadWalker managers = new HashMap(); for ( String readGroup : readGroups ) { - RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), true, true); + RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc); managers.put(readGroup, manager); } @@ -90,13 +121,8 @@ public class TableRecalibrationWalker extends ReadWalker maxReadLen ) { - // throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format()); - //} - byte[] bases = read.getReadBases(); byte[] quals = read.getBaseQualities(); - byte[] recalQuals = new byte[quals.length]; // Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads if (read.getReadNegativeStrandFlag()) { @@ -104,28 +130,34 @@ public class TableRecalibrationWalker extends ReadWalker %d at %s%n", + // cycle, qual, read.getReadNegativeStrandFlag(), newQual, read.getReadName()); recalQuals[cycle] = newQual; //System.out.printf("Mapping %d => %d%n", qual, newQual); } - if (read.getReadNegativeStrandFlag()) - recalQuals = BaseUtils.reverse(quals); - //System.out.printf("OLD: %s%n", read.format()); - read.setBaseQualities(recalQuals); - //System.out.printf("NEW: %s%n", read.format()); - return read; + return recalQuals; } public void onTraversalDone(SAMFileWriter output) { @@ -164,9 +196,58 @@ interface RecalMapping { } class CombinatorialRecalMapping implements RecalMapping { - HashMap cache = new HashMap(); + ArrayList cache; + RecalDataManager manager; public CombinatorialRecalMapping(RecalDataManager manager, Set dinucs, int maxPos, int maxQReported ) { + this.manager = manager; + + // initialize the data structure + cache = new ArrayList(RecalData.NDINUCS); + for ( String dinuc : dinucs ) { + cache.add(new byte[maxPos+1][maxQReported+1]); + } + + for ( RecalData datum : manager.getAll() ) { + //System.out.printf("Adding datum %s%n", datum); + byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc)); + if ( table[datum.pos][datum.qual] != 0 ) + throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); + //table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte()); + table[datum.pos][datum.qual] = datum.empiricalQualByte(); + // System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte()); + } + } + + public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) { + //String dinuc = String.format("%c%c", (char)prevBase, (char)base); + //if ( qual == 2 ) + // System.out.printf("Qual = 2%n"); + + int pos = manager.canonicalPos(cycle); + int index = this.manager.getDinucIndex(prevBase, base); + byte[][] dataTable = index == -1 ? null : cache.get(index); + + if ( dataTable == null && prevBase != 'N' && base != 'N' ) + throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, prevBase, base)); + + byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual; + + //if ( result == 2 ) + // System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n", + // readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result); + + return result; + } +} + +/*class CombinatorialRecalMapping implements RecalMapping { + HashMap cache = new HashMap(); + RecalDataManager manager; + + public CombinatorialRecalMapping(RecalDataManager manager, Set dinucs, int maxPos, int maxQReported ) { + this.manager = manager; + // initialize the data structure for ( String dinuc : dinucs ) { byte[][] table = new byte[maxPos+1][maxQReported+1]; @@ -180,22 +261,32 @@ class CombinatorialRecalMapping implements RecalMapping { throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); //table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte()); table[datum.pos][datum.qual] = datum.empiricalQualByte(); + // System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte()); } } public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) { - //System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d%n", readGroup, prevBase, base, cycle, qual); //String dinuc = String.format("%c%c", (char)prevBase, (char)base); + //if ( qual == 2 ) + // System.out.printf("Qual = 2%n"); + byte[] bp = {prevBase, base}; - String dinuc = new String(bp); + String dinuc = manager.canonicalDinuc(new String(bp)); + int pos = manager.canonicalPos(cycle); byte[][] dataTable = cache.get(dinuc); if ( dataTable == null && prevBase != 'N' && base != 'N' ) throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc)); - return dataTable != null && cycle < dataTable.length ? dataTable[cycle][qual] : qual; + byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual; + + //if ( result == 2 ) + // System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n", + // readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result); + + return result; } -} +}*/ class SerialRecalMapping implements RecalMapping { // mapping from dinuc x Q => new Q