diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index daac3f8fe..dbe33dcb2 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -266,7 +266,7 @@ public abstract class TraversalEngine { final double elapsed = (curTime - startTime) / 1000.0; //System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime)); - if (mustPrint || nRecords % N_RECORDS_TO_PRINT == 0 || maxElapsedIntervalForPrinting(curTime)) { + if (mustPrint || nRecords == 1 || nRecords % N_RECORDS_TO_PRINT == 0 || maxElapsedIntervalForPrinting(curTime)) { this.lastProgressPrintTime = curTime; final double secsPer1MReads = (elapsed * 1000000.0) / nRecords; if (loc != null) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index da4675b3a..f3d9e9efc 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -31,16 +31,13 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)") public String READ_GROUP = "none"; - @Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.") - public int MAX_READ_GROUPS = 100; + //@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.") + //public int MAX_READ_GROUPS = 100; @Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)") public List platforms = Collections.singletonList("*"); //public List platforms = Collections.singletonList("ILLUMINA"); - @Argument(fullName="writeDetailedAnalysisFiles", required=false, doc="If true, we'll write detailed analysis files out at the end of covariate calcuilations") - public boolean writeDetailedAnalysisFiles = false; - @Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="") public boolean collapsePos = false; @@ -54,9 +51,9 @@ public class CovariateCounterWalker extends LocusWalker { long skipped_sites = 0; // number of sites skipped because of a dbSNP entry public void initialize() { - if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS ) - Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." + - "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter"); + //if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS ) + // Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." + + // "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter"); for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { if( readGroup.getAttribute("PL") == null ) @@ -96,7 +93,6 @@ public class CovariateCounterWalker extends LocusWalker { SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG")); if ( isSupportedReadGroup(readGroup) && - //!read.getReadNegativeStrandFlag() && (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) && (read.getMappingQuality() >= MIN_MAPPING_QUALITY)) { int offset = offsets.get(i); @@ -147,8 +143,14 @@ public class CovariateCounterWalker extends LocusWalker { public void onTraversalDone(Integer result) { printInfo(out); - writeTrainingData(); - writeAnalysisData(); + + out.printf("Writing raw recalibration data%n"); + writeRecalTable(); + out.printf("...done%n"); + + //out.printf("Writing logistic recalibration data%n"); + //writeLogisticRecalibrationTable(); + //out.printf("...done%n"); } private void printInfo(PrintStream out) { @@ -161,17 +163,6 @@ public class CovariateCounterWalker extends LocusWalker { out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites); } - - private void writeTrainingData() { - out.printf("Writing raw recalibration data%n"); - writeRecalTable(); - out.printf("...done%n"); - - out.printf("Writing logistic recalibration data%n"); - writeLogisticRecalibrationTable(); - out.printf("...done%n"); - } - private void writeLogisticRecalibrationTable() { PrintStream dinuc_out = null; try { @@ -218,132 +209,6 @@ public class CovariateCounterWalker extends LocusWalker { } } - - // -------------------------------------------------------------------------------------------------------------- - // - // Analysis data - // - // -------------------------------------------------------------------------------------------------------------- - private void writeAnalysisData() { - if ( writeDetailedAnalysisFiles ) { - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - PrintStream DiffVsCycleFile = null, DiffVsDinucFile = null, EmpObsFile = null; - try { - DiffVsCycleFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_cycle.csv"); - DiffVsDinucFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_dinucleotide.csv"); - EmpObsFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".empirical_v_reported_quality.csv"); - - qualityEmpiricalObserved(EmpObsFile, readGroup.getReadGroupId()); - qualityDiffVsCycle(DiffVsCycleFile, readGroup.getReadGroupId()); - qualityDiffVsDinucleotide(DiffVsDinucFile, readGroup.getReadGroupId()); - } catch (FileNotFoundException e){ - throw new RuntimeException("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT, e); - } finally { - if ( DiffVsCycleFile != null) DiffVsCycleFile.close(); - if ( DiffVsDinucFile != null) DiffVsDinucFile.close(); - if ( EmpObsFile != null) EmpObsFile.close(); - } - } - } - } - - class MeanReportedQuality { - double Qn = 0; - long n = 0; - double sumErrors = 0; // Count of estimated number of errors - - void inc(double Q, long n) { - this.n += n; - //Qn += Q * n; // wrong calculation that worked did not account for Qs being in log space - sumErrors += QualityUtils.qualToErrorProb((byte)Q) * n; - } - - double result() { - //return Qn / n; - return -10 * Math.log10(sumErrors / n); - //return QualityUtils.probToQual(1.0 - (sumErrors / n)); - } - } - - public void qualityDiffVsCycle(PrintStream file, final String readGroup) { - ArrayList ByCycle = new ArrayList(); - ArrayList ByCycleReportedQ = new ArrayList(); - file.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n"); - RecalData All = new RecalData(0,0,readGroup,""); - MeanReportedQuality AllReported = new MeanReportedQuality(); - RecalDataManager manager = data.get(readGroup); - int maxReadLen = manager.getMaxReadLen(); - for (int c=0; c < maxReadLen+1; c++) { - ByCycle.add(new RecalData(c, -1,readGroup,"-")); - ByCycleReportedQ.add(new MeanReportedQuality()); - } - - for ( RecalData datum: getRecalData(readGroup) ) { - ByCycle.get(datum.pos).inc(datum.N, datum.B); - ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N); - All.inc(datum.N, datum.B); - AllReported.inc(datum.qual, datum.N); - } - - for (int c=0; c < maxReadLen+1; c++) { - double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); - double reportedQual = ByCycleReportedQ.get(c).result(); - file.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); - } - } - - public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) { - ArrayList ByCycle = new ArrayList(); - ArrayList ByCycleReportedQ = new ArrayList(); - file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); - RecalData All = new RecalData(0,0,readGroup,""); - MeanReportedQuality AllReported = new MeanReportedQuality(); - for (int c=0; c < RecalData.NDINUCS; c++) { - ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c))); - ByCycleReportedQ.add(new MeanReportedQuality()); - } - - for ( RecalData datum: getRecalData(readGroup) ) { - int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); - ByCycle.get(dinucIndex).inc(datum.N, datum.B); - ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); - All.inc(datum.N, datum.B); - AllReported.inc(datum.qual, datum.N); - } - - for (int c=0; c < RecalData.NDINUCS; c++) { - double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); - double reportedQual = ByCycleReportedQ.get(c).result(); - file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); - } - } - - public void qualityEmpiricalObserved(PrintStream file, final String readGroup) { - ArrayList ByQ = new ArrayList(); - ArrayList ByQReportedQ = new ArrayList(); - file.printf("Qrep,Qemp,Qrep_avg,B,N%n"); - RecalData All = new RecalData(0,0,readGroup,""); - MeanReportedQuality AllReported = new MeanReportedQuality(); - for (int q=0; q done() { return new ArrayList(); } - + + /** + * No need to finalize the data in general + * @param nSites + */ + public void finalize(long nSites) { + + } + public abstract String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context); } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java index f6ffbbdd3..378f345a0 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java @@ -13,5 +13,6 @@ public interface VariantAnalysis { public List getParams(); public void initialize(VariantEvalWalker master, PrintStream out); public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context); + public void finalize(long nSites); public List done(); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java index 9cc18cb15..b58c197aa 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java @@ -9,7 +9,7 @@ import java.util.List; import java.util.ArrayList; public class VariantCounter extends BasicVariantAnalysis { - int nBasesCovered = 0; + long nBasesCovered = 0; int nSNPs = 0; public VariantCounter() { @@ -17,11 +17,18 @@ public class VariantCounter extends BasicVariantAnalysis { } public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { - nBasesCovered++; nSNPs += eval == null ? 0 : 1; return null; } + /** + * No need to finalize the data in general + * @param nSites + */ + public void finalize(long nSites) { + nBasesCovered = nSites; + } + public List done() { List s = new ArrayList(); s.add(String.format("n bases covered: %d", nBasesCovered)); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java index 74ef78b0d..54ac40de0 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java @@ -27,12 +27,18 @@ public class VariantEvalWalker extends RefWalker { String COMMENT_STRING = ""; + final String knownSNPDBName = "dbSNP"; + HashMap> analysisSets; + long nSites = 0; + final String ALL_SNPS = "all"; final String SINGLETON_SNPS = "singletons"; final String TWOHIT_SNPS = "2plus_hit"; - final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS }; + final String KNOWN_SNPS = "known"; + final String NOVEL_SNPS = "novel"; + final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS }; public void initialize() { // setup the path to the analysis @@ -57,13 +63,13 @@ public class VariantEvalWalker extends RefWalker { // Add new analyzes here! // analyses.add(new VariantCounter()); - analyses.add(new VariantDBCoverage("dbSNP")); + analyses.add(new VariantDBCoverage(knownSNPDBName)); analyses.add(new TransitionTranversionAnalysis()); analyses.add(new NeighborDistanceAnalysis()); analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold)); analyses.add(new ClusterCounterAnalysis()); - if ( printVariants ) analyses.add(new VariantMatcher("dbSNP")); + if ( printVariants ) analyses.add(new VariantMatcher(knownSNPDBName)); for ( VariantAnalysis analysis : analyses ) { analysis.initialize(this, openAnalysisOutputStream(setName, analysis)); @@ -100,6 +106,8 @@ public class VariantEvalWalker extends RefWalker { } public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { + nSites++; + // Iterate over each analysis, and update it AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null); @@ -107,8 +115,24 @@ public class VariantEvalWalker extends RefWalker { if ( eval != null && eval.getVariationConfidence() < minDiscoveryQ ) eval = null; + // update stats about all of the SNPs updateAnalysisSet(ALL_SNPS, eval, tracker, ref, context); + // update the known / novel set by checking whether the knownSNPDBName track has an entry here + if ( eval != null ) { + AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(knownSNPDBName, null); + String noveltySet = dbsnp == null ? NOVEL_SNPS : KNOWN_SNPS; + updateAnalysisSet(noveltySet, eval, tracker, ref, context); + } + + if ( eval instanceof SNPCallFromGenotypes ) { + SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval; + int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes(); + //System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls); + final String s = nVarGenotypes == 1 ? SINGLETON_SNPS : TWOHIT_SNPS; + updateAnalysisSet(s, eval, tracker, ref, context); + } + if ( eval instanceof SNPCallFromGenotypes ) { SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval; int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes(); @@ -146,8 +170,10 @@ public class VariantEvalWalker extends RefWalker { } private void printAnalysisSet( final String analysisSetName ) { + out.printf("Writing analysis set %s", analysisSetName); Date now = new Date(); for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) { + analysis.finalize(nSites); PrintStream stream = analysis.getPrintStream(); // getAnalysisOutputStream(analysisSetName + "." + analysis.getName(), analysis.getParams()); stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78)); stream.printf("%sAnalysis set %s%n", COMMENT_STRING, analysisSetName);