From 8ac40e8e2dc831e9f97b9021e967d6c9ff5463f6 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 19 Jun 2009 17:45:47 +0000 Subject: [PATCH] Updated version of the recalibration tool git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1060 348d0f76-0448-11de-a6fe-93d51630548a --- .../recalibration/CovariateCounterWalker.java | 287 +++++++++--------- .../recalibration/RecalDataManager.java | 74 ++++- .../TableRecalibrationWalker.java | 17 +- .../sting/utils/ExpandingArrayList.java | 34 +++ .../walkers/recalibration/RecalDataTest.java | 156 ++++++++++ .../sting/utils/ExpandingArrayListTest.java | 153 ++++++++++ python/Gelis2PopSNPs.py | 11 +- python/RecalQual.py | 14 +- python/farm_commands.py | 13 +- python/gatkConfigParser.py | 93 ++++++ testdata/defaultGATKConfig.cfg | 23 ++ 11 files changed, 699 insertions(+), 176 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/utils/ExpandingArrayList.java create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataTest.java create mode 100755 java/test/org/broadinstitute/sting/utils/ExpandingArrayListTest.java create mode 100755 python/gatkConfigParser.py create mode 100755 testdata/defaultGATKConfig.cfg diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index c7bbfdb75..fc4a6d0ab 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -19,8 +19,8 @@ import java.io.FileNotFoundException; @WalkerName("CountCovariates") public class CovariateCounterWalker extends LocusWalker { - @Argument(fullName="maxReadLen", shortName="mrl", doc="max read length", required=false) - public int maxReadLen = 101; + @Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false) + public int buggyMaxReadLen = 100000; //@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", doc="max quality score", required=false) //public int MAX_QUAL_SCORE = 63; @@ -28,11 +28,11 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files") public String OUTPUT_FILEROOT = "output"; - @Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression") - public boolean CREATE_TRAINING_DATA = true; + //@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression") + //public boolean CREATE_TRAINING_DATA = true; - @Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample") - public float DOWNSAMPLE_FRACTION=1.0f; + //@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample") + //public float DOWNSAMPLE_FRACTION=1.0f; @Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score") public int MIN_MAPPING_QUALITY = 0; @@ -47,8 +47,11 @@ public class CovariateCounterWalker extends LocusWalker { public List platforms = Collections.singletonList("*"); //public List platforms = Collections.singletonList("ILLUMINA"); - @Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file") - public boolean outputRawData = true; + //@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file") + //public boolean outputRawData = true; + + @Argument(fullName="writeDetailedAnalysisFiles", required=false, doc="If true, we'll write detailed analysis files out at the end of covariate calcuilations") + public boolean writeDetailedAnalysisFiles = false; @Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="") public boolean collapsePos = false; @@ -76,9 +79,11 @@ public class CovariateCounterWalker extends LocusWalker { if( !isSupportedReadGroup(readGroup) ) continue; String rg = readGroup.getReadGroupId(); - RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc ); + //RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc ); + RecalDataManager manager = new RecalDataManager(rg, ! collapsePos, ! collapseDinuc ); data.put(rg, manager); } + out.printf("Created recalibration data collectors for %d read group(s)%n", data.size()); } private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) { @@ -100,7 +105,7 @@ public class CovariateCounterWalker extends LocusWalker { for (int i =0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); - if ( read.getReadLength() > maxReadLen ) { + if ( read.getReadLength() > buggyMaxReadLen ) { throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format()); } @@ -140,7 +145,7 @@ public class CovariateCounterWalker extends LocusWalker { } int qual = quals[offset]; - if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) { + if ( qual > 0 ) { // && qual <= QualityUtils.MAX_QUAL_SCORE ) { // previous base is the next base in terms of machine chemistry if this is a negative strand // if ( qual == 2 ) //if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") ) @@ -156,6 +161,7 @@ public class CovariateCounterWalker extends LocusWalker { } public void onTraversalDone(Integer result) { + /* PrintStream covars_out; try { covars_out = new PrintStream(OUTPUT_FILEROOT+".covars.out"); @@ -168,14 +174,11 @@ public class CovariateCounterWalker extends LocusWalker { } catch (FileNotFoundException e) { System.err.println("FileNotFoundException: " + e.getMessage()); } - - qualityEmpiricalObserved(); - qualityDiffVsCycle(); - qualityDiffVsDinucleotide(); + */ printInfo(out); - - if (CREATE_TRAINING_DATA) writeTrainingData(); + writeTrainingData(); + writeAnalysisData(); } private void printInfo(PrintStream out) { @@ -190,8 +193,17 @@ public class CovariateCounterWalker extends LocusWalker { private void writeTrainingData() { + out.printf("Writing raw recalibration data%n"); + writeRawTable(); + out.printf("...done%n"); + + out.printf("Writing logistic recalibration data%n"); + writeLogisticRecalibrationTable(); + out.printf("...done%n"); + } + + private void writeLogisticRecalibrationTable() { PrintStream dinuc_out = null; - PrintStream table_out = null; try { dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv"); dinuc_out.println("rg,dn,logitQ,pos,indicator,count"); @@ -207,28 +219,62 @@ public class CovariateCounterWalker extends LocusWalker { } } } - - if ( outputRawData ) { - table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv"); - printInfo(table_out); - table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp"); - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { - if ( datum.N > 0 ) - table_out.format("%s%n", datum.toCSVString(collapsePos)); - } - } - } } catch (FileNotFoundException e) { System.err.println("FileNotFoundException: " + e.getMessage()); - return; } finally { if (dinuc_out != null) dinuc_out.close(); - if (table_out != null) table_out.close(); } + } + private void writeRawTable() { + PrintStream table_out = null; + try { + table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv"); + printInfo(table_out); + table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp"); + for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { + for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { + if ( datum.N > 0 ) + table_out.format("%s%n", datum.toCSVString(collapsePos)); + } + } + } catch (FileNotFoundException e ) { + System.err.println("FileNotFoundException: " + e.getMessage()); + } + finally { + if (table_out != null) table_out.close(); + } + } + + + // -------------------------------------------------------------------------------------------------------------- + // + // Analysis data + // + // -------------------------------------------------------------------------------------------------------------- + private void writeAnalysisData() { + if ( writeDetailedAnalysisFiles ) { + for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { + PrintStream DiffVsCycleFile = null, DiffVsDinucFile = null, EmpObsFile = null; + try { + DiffVsCycleFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_cycle.csv"); + DiffVsDinucFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_dinucleotide.csv"); + EmpObsFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".empirical_v_reported_quality.csv"); + + qualityEmpiricalObserved(EmpObsFile, readGroup.getReadGroupId()); + qualityDiffVsCycle(DiffVsCycleFile, readGroup.getReadGroupId()); + qualityDiffVsDinucleotide(DiffVsDinucFile, readGroup.getReadGroupId()); + } catch (FileNotFoundException e){ + throw new RuntimeException("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT, e); + } finally { + if ( DiffVsCycleFile != null) DiffVsCycleFile.close(); + if ( DiffVsDinucFile != null) DiffVsDinucFile.close(); + if ( EmpObsFile != null) EmpObsFile.close(); + } + } + } } class MeanReportedQuality { @@ -249,114 +295,83 @@ public class CovariateCounterWalker extends LocusWalker { } } - public void qualityDiffVsCycle() { - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - PrintStream ByCycleFile = null; - try { - ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_cycle.csv"); - } catch (FileNotFoundException e){ - System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); - System.exit(1); - } - ArrayList ByCycle = new ArrayList(); - ArrayList ByCycleReportedQ = new ArrayList(); - ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n"); - RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),""); - MeanReportedQuality AllReported = new MeanReportedQuality(); - for (int c=0; c < maxReadLen+1; c++) { - ByCycle.add(new RecalData(c, -1,readGroup.getReadGroupId(),"-")); - ByCycleReportedQ.add(new MeanReportedQuality()); - } - - for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { - ByCycle.get(datum.pos).inc(datum.N, datum.B); - ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N); - All.inc(datum.N, datum.B); - AllReported.inc(datum.qual, datum.N); - } - - for (int c=0; c < maxReadLen+1; c++) { - double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); - double reportedQual = ByCycleReportedQ.get(c).result(); - ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); - } + public void qualityDiffVsCycle(PrintStream file, final String readGroup) { + ArrayList ByCycle = new ArrayList(); + ArrayList ByCycleReportedQ = new ArrayList(); + file.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n"); + RecalData All = new RecalData(0,0,readGroup,""); + MeanReportedQuality AllReported = new MeanReportedQuality(); + RecalDataManager manager = data.get(readGroup); + int maxReadLen = manager.getMaxReadLen(); + for (int c=0; c < maxReadLen+1; c++) { + ByCycle.add(new RecalData(c, -1,readGroup,"-")); + ByCycleReportedQ.add(new MeanReportedQuality()); + } + + for ( RecalData datum: getRecalData(readGroup) ) { + ByCycle.get(datum.pos).inc(datum.N, datum.B); + ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N); + All.inc(datum.N, datum.B); + AllReported.inc(datum.qual, datum.N); + } + + for (int c=0; c < maxReadLen+1; c++) { + double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); + double reportedQual = ByCycleReportedQ.get(c).result(); + file.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); } - //System.out.printf("Cycle: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N)); - //System.out.printf("Qrep=%.1f%n", AllReported.result()); } - public void qualityDiffVsDinucleotide() { - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - PrintStream ByDinucFile = null; - try { - ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_dinucleotide.csv"); - } catch (FileNotFoundException e){ - System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); - System.exit(1); - } - ArrayList ByCycle = new ArrayList(); - ArrayList ByCycleReportedQ = new ArrayList(); - ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); - RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),""); - MeanReportedQuality AllReported = new MeanReportedQuality(); - for (int c=0; c < RecalData.NDINUCS; c++) { - ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),RecalData.dinucIndex2bases(c))); - ByCycleReportedQ.add(new MeanReportedQuality()); - } - - for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) { - int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); - ByCycle.get(dinucIndex).inc(datum.N, datum.B); - ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); - All.inc(datum.N, datum.B); - AllReported.inc(datum.qual, datum.N); - } - - for (int c=0; c < RecalData.NDINUCS; c++) { - double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); - double reportedQual = ByCycleReportedQ.get(c).result(); - ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); - } + public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) { + ArrayList ByCycle = new ArrayList(); + ArrayList ByCycleReportedQ = new ArrayList(); + file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); + RecalData All = new RecalData(0,0,readGroup,""); + MeanReportedQuality AllReported = new MeanReportedQuality(); + for (int c=0; c < RecalData.NDINUCS; c++) { + ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c))); + ByCycleReportedQ.add(new MeanReportedQuality()); + } + + for ( RecalData datum: getRecalData(readGroup) ) { + int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); + ByCycle.get(dinucIndex).inc(datum.N, datum.B); + ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); + All.inc(datum.N, datum.B); + AllReported.inc(datum.qual, datum.N); + } + + for (int c=0; c < RecalData.NDINUCS; c++) { + double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); + double reportedQual = ByCycleReportedQ.get(c).result(); + file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); } - //System.out.printf("Dinuc: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N)); - //System.out.printf("Qrep=%.1f%n", AllReported.result()); } - public void qualityEmpiricalObserved() { - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - PrintStream ByQualFile = null; - try { - ByQualFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".empirical_v_reported_quality.csv"); - } catch (FileNotFoundException e){ - System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); - System.exit(1); - } - ArrayList ByQ = new ArrayList(); - ArrayList ByQReportedQ = new ArrayList(); - ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n"); - RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),""); - MeanReportedQuality AllReported = new MeanReportedQuality(); - for (int q=0; q ByQ = new ArrayList(); + ArrayList ByQReportedQ = new ArrayList(); + file.printf("Qrep,Qemp,Qrep_avg,B,N%n"); + RecalData All = new RecalData(0,0,readGroup,""); + MeanReportedQuality AllReported = new MeanReportedQuality(); + for (int q=0; q { return 0; } - Random random_genrator; - // Print out data for regression - public CovariateCounterWalker() throws FileNotFoundException { - random_genrator = new Random(123454321); // keep same random seed while debugging - } - /** * Check to see whether this read group should be processed. * @param readGroup diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index b37b65e49..54cc64307 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.ExpandingArrayList; import java.util.*; @@ -13,20 +14,28 @@ import java.util.*; */ public class RecalDataManager { ArrayList flattenData = new ArrayList(); - RecalData[][][] data = null; boolean trackPos, trackDinuc; String readGroup; int nDinucs, maxReadLen; + //RecalData[][][] data = null; + ExpandingArrayList>> data = null; + public RecalDataManager(String readGroup, - int maxReadLen, int maxQual, int nDinucs, + //int maxReadLen, int maxQual, int nDinucs, boolean trackPos, boolean trackDinuc) { - data = new RecalData[maxReadLen+1][maxQual+1][nDinucs]; + data = new ExpandingArrayList>>(); + //data = new RecalData[maxReadLen+1][maxQual+1][nDinucs]; + this.readGroup = readGroup; this.trackPos = trackPos; this.trackDinuc = trackDinuc; - this.maxReadLen = maxReadLen; - this.nDinucs = nDinucs; + //this.maxReadLen = maxReadLen; + //this.nDinucs = nDinucs; + } + + public int getMaxReadLen() { + return data.size(); } public int getPosIndex(int pos) { @@ -58,13 +67,11 @@ public class RecalDataManager { if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null ) throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum)); - int posIndex = getPosIndex(datum.pos); int internalDinucIndex = getDinucIndex(datum.dinuc); - if ( internalDinucIndex == -1 ) return; - data[posIndex][datum.qual][internalDinucIndex] = datum; + set(posIndex, datum.qual, internalDinucIndex, datum); flattenData.add(datum); } @@ -78,18 +85,61 @@ public class RecalDataManager { if ( internalDinucIndex == -1 ) return null; - RecalData datum = data[posIndex][qual][internalDinucIndex]; + RecalData datum = get(posIndex, qual, internalDinucIndex); if ( datum == null && expandP ) { //System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index); datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**"); - data[posIndex][qual][internalDinucIndex] = datum; + set(posIndex, qual, internalDinucIndex, datum); flattenData.add(datum); } return datum; } - public List select(int pos, int qual, int dinuc_index ) { + /** + * Returns the RecalData associated with pos, qual, dinuc, or null if not is bound. + * Does not expand the system to corporate a new data point + * @param posIndex + * @param qual + * @param dinucIndex + * @return + */ + private RecalData get(int posIndex, int qual, int dinucIndex) { + ExpandingArrayList> d2 = data.get(posIndex); + if (d2 == null) return null; + ExpandingArrayList d3 = d2.get(qual); + return d3 == null ? null : d3.get(dinucIndex); + } + + //private RecalData get(int posIndex, int qual, int dinucIndex) { + // return data[posIndex][qual][dinucIndex]; + //} + + private void set(int posIndex, int qual, int dinucIndex, RecalData datum) { + // grow data if necessary + ExpandingArrayList> d2 = data.get(posIndex); + if (d2 == null) { + d2 = new ExpandingArrayList>(); + data.set(posIndex, d2); + } + + // Expand d2 if necessary + ExpandingArrayList d3 = d2.get(qual); + if ( d3 == null ) { + d3 = new ExpandingArrayList(); + d2.set(qual, d3); + } + + // set d3 to datum + d3.set(dinucIndex, datum); + } + + //private void set(int posIndex, int qual, int dinucIndex, RecalData datum) { + // data[posIndex][qual][dinucIndex] = datum; + //} + + + /* public List select(int pos, int qual, int dinuc_index ) { List l = new LinkedList(); for ( int i = 0; i < data.length; i++ ) { if ( i == pos || pos == -1 || ! trackPos ) { @@ -147,7 +197,7 @@ public class RecalDataManager { System.out.printf("getDataByDinuc => %d%n", l.size()); return l; - } + }*/ public List getAll() { return flattenData; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 30af5e8ff..9f8b6179b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -32,7 +32,7 @@ public class TableRecalibrationWalker extends ReadWalker cache = new HashMap(); //@Argument(shortName="serial", doc="", required=false) - boolean serialRecalibration = false; + //boolean serialRecalibration = false; private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)"); @@ -98,7 +98,8 @@ public class TableRecalibrationWalker extends ReadWalker managers = new HashMap(); for ( String readGroup : readGroups ) { - RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc); + RecalDataManager manager = new RecalDataManager(readGroup, ! collapsedPos, ! collapsedDinuc); + //RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc); managers.put(readGroup, manager); } @@ -111,10 +112,10 @@ public class TableRecalibrationWalker extends ReadWalker new Q HashMap mappingByDinuc; @@ -338,4 +339,4 @@ class SerialRecalMapping implements RecalMapping { return newQual; } -} \ No newline at end of file +}*/ diff --git a/java/src/org/broadinstitute/sting/utils/ExpandingArrayList.java b/java/src/org/broadinstitute/sting/utils/ExpandingArrayList.java new file mode 100755 index 000000000..10f3f8c95 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/ExpandingArrayList.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.utils; + +import java.util.ArrayList; +import java.util.Collection; + +public class ExpandingArrayList extends ArrayList { + public ExpandingArrayList() { super(); } + public ExpandingArrayList(Collection c) { super(c); } + public ExpandingArrayList(int initialCapacity) { super(initialCapacity); } + + /** + * Returns the element at the specified position in this list. If index > size, + * returns null. Otherwise tries to access the array + * @param index + * @return + * @throws IndexOutOfBoundsException in index < 0 + */ + public E get(int index) throws IndexOutOfBoundsException { + if ( index < size() ) + return super.get(index); + else + return null; + } + + public E set(int index, E element) { + if ( index >= size() ) { + // We need to add null items until we can safely set index to element + for ( int i = size(); i <= index; i++ ) + add(null); + } + + return super.set(index, element); + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataTest.java new file mode 100755 index 000000000..dc0df6721 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataTest.java @@ -0,0 +1,156 @@ +// our package +package org.broadinstitute.sting.gatk.walkers.recalibration; + + +// the imports for unit testing. + +import org.junit.Assert; +import org.junit.Test; +import org.junit.Before; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.QualityUtils; +import java.util.HashSet; + +/** + * Basic unit test for RecalData + */ +public class RecalDataTest extends BaseTest { + RecalData datum, starDatum; + + /** + * Tests that we got a string parameter in correctly + */ + @Before + public void before() { + datum = new RecalData(2, 3, "0", "AA"); + datum.B = datum.N = 1; + starDatum = new RecalData(2, 3, "0", "**"); + starDatum.B = starDatum.N = 1; + } + + @Test + public void testBasic() { + logger.warn("Executing testIsBetween"); + + Assert.assertTrue(datum.N == 1); + Assert.assertTrue(datum.B == 1); + Assert.assertTrue(datum.pos == 2); + Assert.assertTrue(datum.qual == 3); + Assert.assertTrue(datum.readGroup.equals("0")); + Assert.assertTrue(datum.dinuc.equals("AA")); + + Assert.assertTrue(starDatum.N == 1); + Assert.assertTrue(starDatum.B == 1); + Assert.assertTrue(starDatum.pos == 2); + Assert.assertTrue(starDatum.qual == 3); + Assert.assertTrue(starDatum.readGroup.equals("0")); + Assert.assertTrue(starDatum.dinuc.equals("**")); + } + + @Test + public void testInc() { + logger.warn("Executing testInc"); + + datum.inc(1L, 0L); + Assert.assertTrue(datum.N == 2); + Assert.assertTrue(datum.B == 1); + + datum.inc('A', 'A'); + Assert.assertTrue(datum.N == 3); + Assert.assertTrue(datum.B == 1); + + datum.inc('A', 'C'); + Assert.assertTrue(datum.N == 4); + Assert.assertTrue(datum.B == 2); + } + + + @Test + public void testEmpQual() { + logger.warn("Executing testEmpQual"); + + datum.B = 1; + datum.N = 1; + Assert.assertEquals(datum.empiricalQualDouble(), 0.0, 1e-5); + Assert.assertEquals(datum.empiricalQualByte(), 0); + + datum.B = 1; + datum.N = 2; + Assert.assertEquals(datum.empiricalQualDouble(), 3.0103, 1e-5); + Assert.assertEquals(datum.empiricalQualByte(), 3); + + datum.B = 1; + datum.N = 3; + Assert.assertEquals(datum.empiricalQualDouble(), 4.771213, 1e-5); + Assert.assertEquals(datum.empiricalQualByte(), 5); + + datum.B = 1; + datum.B = 2; + Assert.assertEquals(datum.empiricalQualDouble(), 1.760913, 1e-5); + Assert.assertEquals(datum.empiricalQualByte(), 2); + + datum.B = 1; + datum.N = 10; + Assert.assertEquals(datum.empiricalQualDouble(), 10.0, 1e-5); + Assert.assertEquals(datum.empiricalQualByte(), 10); + + datum.B = 1; + datum.N = 100; + Assert.assertEquals(datum.empiricalQualDouble(), 20.0, 1e-5); + Assert.assertEquals(datum.empiricalQualByte(), 20); + + datum.B = 1; + datum.N = 1000; + Assert.assertEquals(datum.empiricalQualDouble(), 30.0, 1e-5); + Assert.assertEquals(datum.empiricalQualByte(), 30); + + datum.B = 0; + datum.N = 1000; + Assert.assertEquals(datum.empiricalQualDouble(), QualityUtils.MAX_REASONABLE_Q_SCORE, 1e-5); + Assert.assertEquals(datum.empiricalQualByte(), QualityUtils.MAX_REASONABLE_Q_SCORE); + } + + + public void testtoCSVString() { + logger.warn("Executing testtoCSVString"); + + Assert.assertEquals(datum.toCSVString(false), "0,AA,3,2,1,1,0"); + Assert.assertEquals(datum.toCSVString(true), "0,AA,3,*,1,1,0"); + } + + + public void testFromCSVString() { + logger.warn("Executing testFromCSVString"); + + Assert.assertEquals(RecalData.fromCSVString("0,AA,3,2,1,1,0").toCSVString(false), datum.toCSVString(false)); + Assert.assertEquals(RecalData.fromCSVString("0,AA,3,*,1,1,0").toCSVString(false), datum.toCSVString(true)); + Assert.assertEquals(RecalData.fromCSVString("0,**,3,2,1,1,0").toCSVString(false), starDatum.toCSVString(false)); + Assert.assertEquals(RecalData.fromCSVString("0,**,3,*,1,1,0").toCSVString(false), starDatum.toCSVString(true)); + } + + public void testDinucIndex() { + logger.warn("Executing testDinucIndex"); + + HashSet indices = new HashSet(); + HashSet unknownBytes = new HashSet(); + byte bases[] = {'A', 'C', 'G', 'T', '*', 'N'}; + unknownBytes.add((byte)'*'); + unknownBytes.add((byte)'N'); + + for ( int i = 0; i < bases.length; i++ ) { + for ( int j = 0; j < bases.length; j++ ) { + byte[] bp = {bases[i], bases[j]}; + String s = new String(bp); + int index = RecalData.dinucIndex(s); + indices.add(index); + Assert.assertEquals(index, RecalData.dinucIndex(bases[i], bases[j])); + if ( index != -1 ) { + Assert.assertEquals(RecalData.dinucIndex2bases(index), s); + } else { + Assert.assertTrue(unknownBytes.contains(bp[0]) || unknownBytes.contains(bp[1]) ); + } + } + } + Assert.assertEquals(indices.size(), 17); + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/utils/ExpandingArrayListTest.java b/java/test/org/broadinstitute/sting/utils/ExpandingArrayListTest.java new file mode 100755 index 000000000..3f0d91760 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/ExpandingArrayListTest.java @@ -0,0 +1,153 @@ +// our package +package org.broadinstitute.sting.utils; + + +// the imports for unit testing. + +import org.junit.Assert; +import org.junit.Test; +import org.junit.Before; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.recalibration.RecalData; +import org.broadinstitute.sting.utils.QualityUtils; +import java.util.HashSet; +import java.util.Arrays; + +/** + * Basic unit test for RecalData + */ +public class ExpandingArrayListTest extends BaseTest { + ExpandingArrayList empty, initCap10, hasOne, hasTen; + + @Before + public void before() { + empty = new ExpandingArrayList(); + + initCap10 = new ExpandingArrayList(10); + + hasOne = new ExpandingArrayList(); + hasOne.add(1); + + hasTen = new ExpandingArrayList(); + hasTen.addAll(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)); + } + + @Test + public void testBasicSizes() { + logger.warn("Executing testBasicSizes"); + + Assert.assertEquals(empty.size(), 0); + Assert.assertEquals(initCap10.size(), 0); + Assert.assertEquals(hasOne.size(), 1); + Assert.assertEquals(hasTen.size(), 10); + } + + @Test + public void testTenElements() { + logger.warn("Executing testTenElements"); + + for ( int i = 0; i < 10; i++ ) { + Assert.assertEquals((int)hasTen.get(i), i+1); + } + } + + @Test + public void testSettingTenElements() { + logger.warn("Executing testSettingTenElements"); + + for ( int i = 0; i < 10; i++ ) { + Assert.assertEquals((int)hasTen.set(i, 2*i), i+1); + } + + Assert.assertEquals(hasTen.size(), 10); + for ( int i = 0; i < 10; i++ ) { + Assert.assertEquals((int)hasTen.get(i), 2*i); + } + } + + @Test + public void testAdd() { + logger.warn("Executing testAdd"); + + Assert.assertEquals(empty.size(), 0); + empty.add(1); + Assert.assertEquals(empty.size(), 1); + Assert.assertEquals((int)empty.get(0), 1); + empty.add(2); + Assert.assertEquals(empty.size(), 2); + Assert.assertEquals((int)empty.get(1), 2); + } + + @Test + public void testSet1() { + logger.warn("Executing testSet1"); + + Assert.assertEquals(empty.size(), 0); + empty.set(0, 1); + Assert.assertEquals(empty.size(), 1); + Assert.assertEquals((int)empty.get(0), 1); + + empty.set(1, 2); + Assert.assertEquals(empty.size(), 2); + Assert.assertEquals((int)empty.get(1), 2); + + // doesn't expand + empty.set(0, 3); + Assert.assertEquals(empty.size(), 2); + Assert.assertEquals((int)empty.get(0), 3); + } + + @Test + public void testSetExpanding() { + logger.warn("Executing testSetExpanding"); + + Assert.assertEquals(empty.size(), 0); + empty.set(3, 1); + Assert.assertEquals(empty.size(), 4); + Assert.assertEquals(empty.get(0), null); + Assert.assertEquals(empty.get(1), null); + Assert.assertEquals(empty.get(2), null); + Assert.assertEquals((int)empty.get(3), 1); + } + + @Test + public void testSetExpandingReset() { + logger.warn("Executing testSetExpandingReset"); + + Assert.assertEquals(empty.size(), 0); + empty.set(3, 3); + empty.set(2, 2); + empty.set(1, 1); + empty.set(0, 0); + Assert.assertEquals(empty.size(), 4); + for ( int i = 0; i < 4; i++ ) + Assert.assertEquals((int)empty.get(i), i); + } + + @Test + public void testSetExpandingBig() { + logger.warn("Executing testSetExpandingBig"); + + Assert.assertEquals(empty.size(), 0); + empty.set(1000, 1000); + Assert.assertEquals(empty.size(), 1001); + for ( int i = 0; i < 1000; i++ ) + Assert.assertEquals(empty.get(i), null); + Assert.assertEquals((int)empty.get(1000), 1000); + } + + @Test (expected=IndexOutOfBoundsException.class ) + public void testSetBadGetNegative() { + logger.warn("Executing testSetBadGetNegative"); + empty.get(-1); + } + + @Test + public void testSetBadGetPost() { + logger.warn("Executing testSetBadGetPost"); + empty.set(1, 1); + Assert.assertEquals(empty.get(0), null); + Assert.assertEquals((int)empty.get(1), 1); + Assert.assertEquals(empty.get(2), null); + } +} \ No newline at end of file diff --git a/python/Gelis2PopSNPs.py b/python/Gelis2PopSNPs.py index 77ccfe1c2..871bf44b4 100755 --- a/python/Gelis2PopSNPs.py +++ b/python/Gelis2PopSNPs.py @@ -16,12 +16,17 @@ def geli2dbsnpFile(geli): return os.path.join(root, flowcellDotlane) + '.dbsnp_matches' def SingleSampleGenotyperCmd(bam, geli, use2bp): - naid = bam.split(".")[0] + naid = os.path.split(bam)[1].split(".")[0] metrics = geli + '.metrics' gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar' hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff' + hapmapChipStr = ' '.join(['--hapmap_chip', hapmapChip]) + if not os.path.exists(hapmapChip): + print '*** warning, no hapmap chip resuls for', naid + hapmapChipStr = '' + targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list' - cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp.rod.out', '--hapmap_chip', hapmapChip, '-calls', geli, '-met', metrics, '-geli -fb -l INFO']) + cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod', '-metout', metrics, '-varout', geli, '-geli -l INFO ']) + hapmapChipStr return cmd def bams2geli(bams): @@ -34,7 +39,7 @@ def bams2geli(bams): else: if not os.path.exists(geli): cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling()) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry ) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry ) return geli, jobid calls = map(call1, bams) return map(lambda x: x[0], calls), map(lambda x: x[1], calls) diff --git a/python/RecalQual.py b/python/RecalQual.py index ae5ffa78a..bf41c377c 100755 --- a/python/RecalQual.py +++ b/python/RecalQual.py @@ -17,7 +17,7 @@ output_root = './' # Location of the resource files distributed with the recalibration tool. # If editing, please end this variable with a trailing slash. -resources='resources/' +resources='recalibratorResources/' #resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/' import getopt,glob,os,sys @@ -30,17 +30,17 @@ if not os.path.isabs(resources): resources = os.path.abspath(application_path) + '/' + resources # Where does the reference live? -reference_base = resources + 'human_b36_both' +reference_base = resources + 'Homo_sapiens_assembly18' reference = reference_base + '.fasta' reference_dict = reference_base + '.dict' reference_fai = reference_base + '.fasta.fai' # Where does DBSNP live? -dbsnp = resources + 'dbsnp.1kg.rod.out' +dbsnp = resources + 'dbsnp_129_hg18.rod' # Where are the application files required to run the recalibration? -gatk = resources + 'gatk/GenomeAnalysisTK.jar' -#gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar' +#gatk = resources + 'gatk/GenomeAnalysisTK.jar' +gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK_stable/trunk/dist/GenomeAnalysisTK.jar' logistic_regression_script = resources + 'logistic_regression.R' empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R' @@ -146,7 +146,7 @@ def usage(): # Try to parse the given command-line arguments. try: - opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry']) + opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry', 'outputRoot=']) except getopt.GetoptError, err: usage() @@ -168,6 +168,8 @@ for opt,arg in opts: reprocessFiles = True if opt == '--dry': dryRun = True + if opt == '--outputRoot': + output_root = arg # Default to 'recalibrate' unless the user specified only evaluate. do_recalibration = not (evaluate_requested and not recalibrate_requested) diff --git a/python/farm_commands.py b/python/farm_commands.py index acdcf7c3f..883f7776b 100644 --- a/python/farm_commands.py +++ b/python/farm_commands.py @@ -27,20 +27,17 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman cmd_str += " \""+cmd_str_from_user + "\"" print ">>> Farming via "+cmd_str - - #result = 'Job <2542666> is submitted to queue .' - result = subprocess.Popen([cmd_str, ""], shell=True, stdout=subprocess.PIPE).communicate()[0] - - p = re.compile('Job <(\d+)> is submitted to queue') - jobid = p.match(result).group(1) - - return jobid else: cmd_str = cmd_str_from_user print ">>> Executing "+cmd_str if just_print_commands or (globals().has_key("justPrintCommands") and globals().justPrintCommands): return -1 + elif farm_queue: + result = subprocess.Popen([cmd_str, ""], shell=True, stdout=subprocess.PIPE).communicate()[0] + p = re.compile('Job <(\d+)> is submitted to queue') + jobid = p.match(result).group(1) + return jobid else: # Actually execute the command if we're not just in debugging output mode status = os.system(cmd_str) diff --git a/python/gatkConfigParser.py b/python/gatkConfigParser.py new file mode 100755 index 000000000..27892f560 --- /dev/null +++ b/python/gatkConfigParser.py @@ -0,0 +1,93 @@ +# +# GATK configuration parser +# +import ConfigParser +import os.path +import sys + +defaultRequiredOptions = {} +def addRequiredOption(name, type): + defaultRequiredOptions[name] = type + +addRequiredOption('jar', 'input_file') +addRequiredOption('reference', 'input_file') +addRequiredOption('referenceIndex', 'input_file') +addRequiredOption('referenceDict', 'input_file') +addRequiredOption('java', 'file') +addRequiredOption('jvm_args', str) +addRequiredOption('args', str) +addRequiredOption('tmp', 'output_file') + +class gatkConfigParser(ConfigParser.SafeConfigParser): + GATK = 'DEFAULT' + + def __init__(self, configFiles): + ConfigParser.SafeConfigParser.__init__(self) + files = filter(None, configFiles) + print 'Reading configuration file(s):', files + print self.read(files) + self.validateRequiredOptions() + + def validateRequiredOptions(self): + for key, value in defaultRequiredOptions.iteritems(): + self.validateOption(self.GATK, key, value) + + def validateOption(self, section, name, type = str): + v = self.getOption(section, name, type) + print ' => Validated option', name, v + + def getGATKOption(self, name, type = str): + return self.getOption(self.GATK, name, type) + + def getGATKModeOption(self, name, mode, type = str): + return self.getOption(mode, name, type) + + def getOption(self, section, name, typeF = None): + if not self.has_option(section, name): + raise "Option %s not found in section %s" % (name, section) + else: + val = self.get(section, name) + if typeF == 'input_file' or typeF == 'output_file': + path = os.path.abspath(os.path.expanduser(val)) + if typeF == 'input_file': + if not os.path.exists(path): + raise "Input file does not exist", path + if not os.access(path, os.R_OK): + raise "Input file cannot be read", path + if typeF == 'output_file': + if not os.access(path, os.W_OK): + raise "Output file cannot be written", path + return path + elif type(typeF) == str: + return str(val) + elif typeF == None: + return val + else: + return typeF(val) + + def java(self): return self.getOption(self.GATK, 'java') + def jvm_args(self): return self.getOption(self.GATK, 'jvm_args') + def jar(self): return self.getOption(self.GATK, 'jar') + def gatk_args(self): return self.getOption(self.GATK, 'args') + def reference(self): return self.getOption(self.GATK, 'reference') + + def gatkCmd(self, mode): + cmd = ' '.join([self.java(), self.jvm_args(), '-jar', self.jar(), self.gatk_args(), '-R', self.reference()]) + cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)]) + return cmd + +import unittest +class TestMergeBAMsUtils(unittest.TestCase): + def setUp(self): + configFile = os.path.join(os.path.split(sys.argv[0])[0] + "/../testdata/defaultGATKConfig.cfg") + self.config = gatkConfigParser(configFile) + + def testValidate(self): + self.config.validateRequiredOptions() + + def testCmd(self): + s = "java -ea -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:1-10,000,000 -R /home/radon01/depristo/work/humanref/Homo_sapiens_assembly18.fasta -T CountCovariates --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1" + self.assertEquals(self.config.gatkCmd('CountCovariates'), s) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/testdata/defaultGATKConfig.cfg b/testdata/defaultGATKConfig.cfg new file mode 100755 index 000000000..a45525fa2 --- /dev/null +++ b/testdata/defaultGATKConfig.cfg @@ -0,0 +1,23 @@ +[DEFAULT] +jar: ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar +referenceRoot: /home/radon01/depristo/work/humanref/Homo_sapiens_assembly18 +reference: %(referenceRoot)s.fasta +referenceIndex: %(reference)s.fai +referenceDict: %(referenceRoot)s.dict +java: java +jvm_args: -ea -Xmx2048m +gatkData: /humgen/gsa-scr1/GATK_Data +dbsnp: %(gatkData)s/dbsnp_129_hg18.rod +tmp: /tmp/ +args: -l INFO +#args: -l INFO -L chr1:1-1,000,000 + +[CountCovariates] +args: --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -D %(dbsnp)s + +[TableRecalibration] +args: -compress 1 + +[R] +Rscript: /broad/tools/apps/R-2.6.0/bin/Rscript +PlotQEmpStated: ~andrewk/recalQual_resources/plot_q_emp_stated_hst.R