From 7d281296a7e0ebb8b9a64cafefb55ab82a08fd9b Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 17 Jun 2009 14:12:40 +0000 Subject: [PATCH] Finishing checking for building git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1027 348d0f76-0448-11de-a6fe-93d51630548a --- .../recalibration/CovariateCounterWalker.java | 47 +++++++++++------- .../gatk/walkers/recalibration/RecalData.java | 18 ++++--- .../recalibration/RecalDataManager.java | 49 +++++++++---------- .../broadinstitute/sting/utils/BaseUtils.java | 1 + .../sting/utils/QualityUtils.java | 4 +- 5 files changed, 67 insertions(+), 52 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/CovariateCounterWalker.java index a12836050..036f71931 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -13,11 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.BaseUtils; -import java.util.ArrayList; -import java.util.List; -import java.util.Random; -import java.util.HashMap; -import java.util.Collections; +import java.util.*; import java.io.PrintStream; import java.io.FileNotFoundException; @@ -53,13 +49,17 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file") public boolean outputRawData = true; + @Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="") + public boolean collapsePos = false; + + @Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="") + public boolean collapseDinuc = false; + int NDINUCS = 16; //ArrayList flattenData = new ArrayList(); //HashMap data = new HashMap(); HashMap data = new HashMap(); //RecalData[][][] data; - boolean trackPos = true; - boolean trackDinuc = true; long counted_sites = 0; // number of sites used to count covariates long counted_bases = 0; // number of bases used to count covariates @@ -76,14 +76,15 @@ public class CovariateCounterWalker extends LocusWalker { if( !isSupportedReadGroup(readGroup) ) continue; String rg = readGroup.getReadGroupId(); - RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, trackPos, trackDinuc ); - //data.put(rg, new RecalData[maxReadLen+1][QualityUtils.MAX_QUAL_SCORE+1][NDINUCS]); + RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, ! collapsePos, ! collapseDinuc ); data.put(rg, manager); } } - private RecalData getRecalData(String readGroup, int pos, int qual, int dinuc_index) { - return data.get(readGroup).expandingGetRecalData(pos, qual, dinuc_index, true); + private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) { + byte[] cs = {(byte)prevBase, (byte)base}; + String s = new String(cs); + return data.get(readGroup).expandingGetRecalData(pos, qual, s, true); } private List getRecalData(String readGroup) { @@ -141,9 +142,9 @@ public class CovariateCounterWalker extends LocusWalker { int qual = quals[offset]; if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) { // previous base is the next base in terms of machine chemistry if this is a negative strand - int dinuc_index = RecalData.bases2dinucIndex(prevBase, base, false); //System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref)); - getRecalData(rg, cycle, qual, dinuc_index).inc(base,ref); + RecalData datum = getRecalData(rg, cycle, qual, prevBase, base); + if (datum != null) datum.inc(base,ref); return 1; } else { return 0; @@ -168,14 +169,22 @@ public class CovariateCounterWalker extends LocusWalker { qualityDiffVsCycle(); qualityDiffVsDinucleotide(); - out.printf("Counted sites: %d%n", counted_sites); - out.printf("Counted bases: %d%n", counted_bases); - out.printf("Skipped sites: %d%n", skipped_sites); - out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites); + printInfo(out); if (CREATE_TRAINING_DATA) writeTrainingData(); } + void printInfo(PrintStream out) { + out.printf("# date %s%n", new Date()); + out.printf("# collapsed_pos %b%n", collapsePos); + out.printf("# collapsed_dinuc %b%n", collapseDinuc); + out.printf("# counted_sites %d%n", counted_sites); + out.printf("# counted_bases %d%n", counted_bases); + out.printf("# skipped_sites %d%n", skipped_sites); + out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites); + } + + void writeTrainingData() { PrintStream dinuc_out = null; PrintStream table_out = null; @@ -197,10 +206,12 @@ public class CovariateCounterWalker extends LocusWalker { if ( outputRawData ) { table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv"); + printInfo(table_out); + table_out.println("rg,dn,Qrep,pos,NBases,MMismatches"); for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { if ( datum.N > 0 ) - table_out.format("%s%n", datum.toCSVString()); + table_out.format("%s%n", datum.toCSVString(collapsePos)); } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/RecalData.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/RecalData.java index 662fe3a49..2643b1d79 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/RecalData.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/RecalData.java @@ -24,9 +24,9 @@ public class RecalData { B += incB; } - public int getDinucIndex() { - return string2dinucIndex(this.dinuc); - } + //public int getDinucIndex() { + // return string2dinucIndex(this.dinuc); + //} public void inc(char curBase, char ref) { inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1); @@ -52,8 +52,8 @@ public class RecalData { return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); } - public String toCSVString() { - return String.format("%s,%s,%d,%d,%d,%d", readGroup, dinuc, qual, pos, N, B); + public String toCSVString(boolean collapsedPos) { + return String.format("%s,%s,%d,%s,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B); } public static RecalData fromCSVString(String s) { @@ -61,7 +61,8 @@ public class RecalData { String rg = vals[0]; String dinuc = vals[1]; int qual = Integer.parseInt(vals[2]); - int pos = Integer.parseInt(vals[3]); + + int pos = vals[3].equals("*") ? 0 : Integer.parseInt(vals[3]); int N = Integer.parseInt(vals[4]); int B = Integer.parseInt(vals[5]); RecalData datum = new RecalData(pos, qual, rg, dinuc); @@ -74,9 +75,12 @@ public class RecalData { } public static int bases2dinucIndex(char prevBase, char base, boolean Complement) { + if ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 || BaseUtils.simpleBaseToBaseIndex(base) == -1 ) + return -1; + if (!Complement) { return BaseUtils.simpleBaseToBaseIndex(prevBase) * 4 + BaseUtils.simpleBaseToBaseIndex(base); - }else{ + }else { return (3 - BaseUtils.simpleBaseToBaseIndex(prevBase)) * 4 + (3 - BaseUtils.simpleBaseToBaseIndex(base)); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/RecalDataManager.java index c6fc3f4e1..c20212149 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/recalibration/RecalDataManager.java @@ -1,21 +1,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.recalibration; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.BaseUtils; import java.util.*; -import java.io.PrintStream; -import java.io.FileNotFoundException; - -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; /** * Created by IntelliJ IDEA. @@ -34,7 +21,7 @@ public class RecalDataManager { public RecalDataManager(String readGroup, int maxReadLen, int maxQual, int nDinucs, boolean trackPos, boolean trackDinuc) { - data = new RecalData[maxReadLen+1][QualityUtils.MAX_QUAL_SCORE+1][nDinucs]; + data = new RecalData[maxReadLen+1][maxQual+1][nDinucs]; this.readGroup = readGroup; this.trackPos = trackPos; this.trackDinuc = trackDinuc; @@ -46,8 +33,12 @@ public class RecalDataManager { return trackPos ? pos : 0; } - public int getDinucIndex(int dinuc) { - return trackDinuc ? dinuc : 0; + public int getDinucIndex(String dinuc) { + if ( trackDinuc ) { + return RecalData.string2dinucIndex(dinuc); + } else { + return 0; + } } public void addDatum(RecalData datum) { @@ -55,27 +46,33 @@ public class RecalDataManager { throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup)); } - if ( getRecalData(datum.pos, datum.qual, datum.getDinucIndex()) != null ) - throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.getDinucIndex()), datum)); + if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null ) + throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum)); + int posIndex = getPosIndex(datum.pos); - int internalDinucIndex = getDinucIndex(datum.getDinucIndex()); + int internalDinucIndex = getDinucIndex(datum.dinuc); + + if ( internalDinucIndex == -1 ) return; + data[posIndex][datum.qual][internalDinucIndex] = datum; flattenData.add(datum); } - public RecalData getRecalData(int pos, int qual, int dinuc_index) { - return expandingGetRecalData(pos, qual, dinuc_index, false); + public RecalData getRecalData(int pos, int qual, String dinuc) { + return expandingGetRecalData(pos, qual, dinuc, false); } - public RecalData expandingGetRecalData(int pos, int qual, int dinuc_index, boolean expandP) { + public RecalData expandingGetRecalData(int pos, int qual, String dinuc, boolean expandP) { int posIndex = getPosIndex(pos); - int internalDinucIndex = getDinucIndex(dinuc_index); + int internalDinucIndex = getDinucIndex(dinuc); + + if ( internalDinucIndex == -1 ) return null; RecalData datum = data[posIndex][qual][internalDinucIndex]; if ( datum == null && expandP ) { //System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index); - datum = new RecalData(posIndex, qual, readGroup, RecalData.dinucIndex2bases(dinuc_index)); + datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**"); data[posIndex][qual][internalDinucIndex] = datum; flattenData.add(datum); } @@ -108,7 +105,7 @@ public class RecalDataManager { for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) { RecalData datum = new RecalData(pos, qual, readGroup, "**"); for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) { - RecalData datum2 = getRecalData(pos, qual, dinucIndex); + RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex)); if ( datum2 != null ) datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B); } @@ -128,7 +125,7 @@ public class RecalDataManager { RecalData datum = new RecalData(-1, qual, readGroup, RecalData.dinucIndex2bases(dinucIndex)); System.out.printf("Aggregating [%s]:%n", datum); for ( int pos = 0; pos < data.length; pos++ ) { - RecalData datum2 = getRecalData(pos, qual, dinucIndex); + RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex)); if ( datum2 != null ) { System.out.printf(" + [%s]:%n", datum2); datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B); diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 0bf5aecbb..3061b4fa0 100644 --- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -54,6 +54,7 @@ public class BaseUtils { */ static public int simpleBaseToBaseIndex(char base) { switch (base) { + case '*': // the wildcard character counts as an A case 'A': case 'a': return 0; diff --git a/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 42d00c8e6..fc451aec3 100755 --- a/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -7,6 +7,8 @@ package org.broadinstitute.sting.utils; * @author Kiran Garimella */ public class QualityUtils { + public final static byte MAX_QUAL_SCORE = 63; + /** * Private constructor. No instantiating this class! */ @@ -70,7 +72,7 @@ public class QualityUtils { * @return the capped quality score */ static public byte boundQual(int qual) { - return (byte) Math.min(qual, 63); + return (byte) Math.min(qual, MAX_QUAL_SCORE); } /**