From 26eb362f52b9f4c3205b911885fc2e4ab981b7ed Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Sun, 21 Jun 2009 21:27:40 +0000
Subject: [PATCH] Added novel / known split to variant eval.  That is, emits
 all of the standard analyses on SNP partitioned into those known in the
 provided known db and those novel.  Also fixed problem with counting bases
 within subsets

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1068 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../gatk/traversals/TraversalEngine.java      |   2 +-
 .../recalibration/CovariateCounterWalker.java | 161 ++----------------
 .../varianteval/BasicVariantAnalysis.java     |  10 +-
 .../walkers/varianteval/VariantAnalysis.java  |   1 +
 .../walkers/varianteval/VariantCounter.java   |  11 +-
 .../varianteval/VariantEvalWalker.java        |  32 +++-
 6 files changed, 62 insertions(+), 155 deletions(-)
diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
index daac3f8fe..dbe33dcb2 100755
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
@@ -266,7 +266,7 @@ public abstract class TraversalEngine {
         final double elapsed = (curTime - startTime) / 1000.0;
         //System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
 
-        if (mustPrint || nRecords % N_RECORDS_TO_PRINT == 0 || maxElapsedIntervalForPrinting(curTime)) {
+        if (mustPrint || nRecords == 1 || nRecords % N_RECORDS_TO_PRINT == 0 || maxElapsedIntervalForPrinting(curTime)) {
             this.lastProgressPrintTime = curTime;
             final double secsPer1MReads = (elapsed * 1000000.0) / nRecords;
             if (loc != null)
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
index da4675b3a..f3d9e9efc 100644
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
@@ -31,16 +31,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
     @Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)")
     public String READ_GROUP = "none";
 
-    @Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
-    public int MAX_READ_GROUPS = 100;
+    //@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
+    //public int MAX_READ_GROUPS = 100;
 
     @Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
     public List<String> platforms = Collections.singletonList("*");
     //public List<String> platforms = Collections.singletonList("ILLUMINA");
 
-    @Argument(fullName="writeDetailedAnalysisFiles", required=false, doc="If true, we'll write detailed analysis files out at the end of covariate calcuilations")
-    public boolean writeDetailedAnalysisFiles = false;
-
     @Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
     public boolean collapsePos = false;
 
@@ -54,9 +51,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
     long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
 
     public void initialize() {
-        if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS )
-            Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." +
-                            "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter");
+        //if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS )
+        //    Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." +
+        //                    "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter");
 
         for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
             if( readGroup.getAttribute("PL") == null )
@@ -96,7 +93,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
 
                 SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
                 if ( isSupportedReadGroup(readGroup) &&
-                    //!read.getReadNegativeStrandFlag() &&
                     (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
                     (read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
                     int offset = offsets.get(i);
@@ -147,8 +143,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
 
     public void onTraversalDone(Integer result) {
         printInfo(out);
-        writeTrainingData();
-        writeAnalysisData();
+
+        out.printf("Writing raw recalibration data%n");
+        writeRecalTable();
+        out.printf("...done%n");
+
+        //out.printf("Writing logistic recalibration data%n");
+        //writeLogisticRecalibrationTable();
+        //out.printf("...done%n");
     }
 
     private void printInfo(PrintStream out) {
@@ -161,17 +163,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
         out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
     }
 
-
-    private void writeTrainingData() {
-        out.printf("Writing raw recalibration data%n");
-        writeRecalTable();
-        out.printf("...done%n");
-
-        out.printf("Writing logistic recalibration data%n");
-        writeLogisticRecalibrationTable();
-        out.printf("...done%n");        
-    }
-
     private void writeLogisticRecalibrationTable() {
         PrintStream dinuc_out = null;
         try {
@@ -218,132 +209,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
          }
      }
 
-
-    // --------------------------------------------------------------------------------------------------------------
-    //
-    // Analysis data
-    //
-    // --------------------------------------------------------------------------------------------------------------
-    private void writeAnalysisData() {
-        if ( writeDetailedAnalysisFiles ) {
-            for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
-                PrintStream DiffVsCycleFile = null, DiffVsDinucFile = null, EmpObsFile = null;
-                try {
-                    DiffVsCycleFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_cycle.csv");
-                    DiffVsDinucFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_dinucleotide.csv");
-                    EmpObsFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".empirical_v_reported_quality.csv");
-
-                    qualityEmpiricalObserved(EmpObsFile, readGroup.getReadGroupId());
-                    qualityDiffVsCycle(DiffVsCycleFile, readGroup.getReadGroupId());
-                    qualityDiffVsDinucleotide(DiffVsDinucFile, readGroup.getReadGroupId());
-                } catch (FileNotFoundException e){
-                    throw new RuntimeException("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT, e);
-                } finally {
-                    if ( DiffVsCycleFile != null) DiffVsCycleFile.close();
-                    if ( DiffVsDinucFile != null) DiffVsDinucFile.close();
-                    if ( EmpObsFile != null) EmpObsFile.close();
-                }
-            }
-        }
-    }
-
-    class MeanReportedQuality {
-        double Qn = 0;
-        long n = 0;
-        double sumErrors = 0; // Count of estimated number of errors
-
-        void inc(double Q, long n) {
-            this.n += n;
-            //Qn += Q * n; // wrong calculation that worked did not account for Qs being in log space
-            sumErrors += QualityUtils.qualToErrorProb((byte)Q) * n;
-        }
-
-        double result() {
-            //return Qn / n;
-            return -10 * Math.log10(sumErrors / n);
-            //return QualityUtils.probToQual(1.0 - (sumErrors / n));
-        }
-    }
-
-    public void qualityDiffVsCycle(PrintStream file, final String readGroup) {
-        ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
-        ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
-        file.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
-        RecalData All = new RecalData(0,0,readGroup,"");
-        MeanReportedQuality AllReported = new MeanReportedQuality();
-        RecalDataManager manager = data.get(readGroup);
-        int maxReadLen = manager.getMaxReadLen();
-        for (int c=0; c < maxReadLen+1; c++)  {
-            ByCycle.add(new RecalData(c, -1,readGroup,"-"));
-            ByCycleReportedQ.add(new MeanReportedQuality());
-        }
-
-        for ( RecalData datum: getRecalData(readGroup) ) {
-            ByCycle.get(datum.pos).inc(datum.N, datum.B);
-            ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
-            All.inc(datum.N, datum.B);
-            AllReported.inc(datum.qual, datum.N);
-        }
-
-        for (int c=0; c < maxReadLen+1; c++) {
-            double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
-            double reportedQual = ByCycleReportedQ.get(c).result();
-            file.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
-        }
-    }
-
-    public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) {
-        ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
-        ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
-        file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
-        RecalData All = new RecalData(0,0,readGroup,"");
-        MeanReportedQuality AllReported = new MeanReportedQuality();
-        for (int c=0; c < RecalData.NDINUCS; c++) {
-            ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c)));
-            ByCycleReportedQ.add(new MeanReportedQuality());
-        }
-
-        for ( RecalData datum: getRecalData(readGroup) ) {
-            int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
-            ByCycle.get(dinucIndex).inc(datum.N, datum.B);
-            ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
-            All.inc(datum.N, datum.B);
-            AllReported.inc(datum.qual, datum.N);
-        }
-
-        for (int c=0; c < RecalData.NDINUCS; c++) {
-            double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
-            double reportedQual = ByCycleReportedQ.get(c).result();
-            file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
-        }
-    }
-
-    public void qualityEmpiricalObserved(PrintStream file, final String readGroup) {
-        ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
-        ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
-        file.printf("Qrep,Qemp,Qrep_avg,B,N%n");
-        RecalData All = new RecalData(0,0,readGroup,"");
-        MeanReportedQuality AllReported = new MeanReportedQuality();
-        for (int q=0; q<QualityUtils.MAX_QUAL_SCORE+1; q++) {
-            ByQ.add(new RecalData(-1,q,readGroup,"-"));
-            ByQReportedQ.add(new MeanReportedQuality());
-        }
-
-        for ( RecalData datum: getRecalData(readGroup) ){
-            ByQ.get(datum.qual).inc(datum.N, datum.B);
-            ByQReportedQ.get(datum.qual).inc(datum.qual, datum.N);
-            All.inc(datum.N, datum.B);
-            AllReported.inc(datum.qual, datum.N);
-            //out.printf("%2d%6d%3d %2d %s%n", datum.qual, datum.N, datum.pos, datum.qual, datum.dinuc);
-        }
-
-        for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) {
-            double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
-            file.printf("%3d, %2.2f, %2.2f, %12d, %12d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
-            //out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);                                                                                      n
-        }
-    }
-
     public Integer reduceInit() {
         return 0;
     }
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java
index 99e27399d..8c6b90d06 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java
@@ -37,6 +37,14 @@ public abstract class BasicVariantAnalysis implements VariantAnalysis {
     public List<String> done() {
         return new ArrayList<String>();
     }
-    
+
+    /**
+     * No need to finalize the data in general
+     * @param nSites
+     */
+    public void finalize(long nSites) {
+
+    }
+
     public abstract String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
 }
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java
index f6ffbbdd3..378f345a0 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java
@@ -13,5 +13,6 @@ public interface VariantAnalysis {
     public List<String> getParams();
     public void initialize(VariantEvalWalker master, PrintStream out);
     public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
+    public void finalize(long nSites);
     public List<String> done();
 }
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java
index 9cc18cb15..b58c197aa 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java
@@ -9,7 +9,7 @@ import java.util.List;
 import java.util.ArrayList;
 
 public class VariantCounter extends BasicVariantAnalysis {
-    int nBasesCovered = 0;
+    long nBasesCovered = 0;
     int nSNPs = 0;
 
     public VariantCounter() {
@@ -17,11 +17,18 @@ public class VariantCounter extends BasicVariantAnalysis {
     }
 
     public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
-        nBasesCovered++;
         nSNPs += eval == null ? 0 : 1;
         return null;
     }
 
+    /**
+     * No need to finalize the data in general
+     * @param nSites
+     */
+    public void finalize(long nSites) {
+        nBasesCovered = nSites;
+    }
+
     public List<String> done() {
         List<String> s = new ArrayList<String>();
         s.add(String.format("n bases covered: %d", nBasesCovered));
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java
index 74ef78b0d..54ac40de0 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -27,12 +27,18 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
 
     String COMMENT_STRING = "";
 
+    final String knownSNPDBName = "dbSNP";
+
     HashMap<String, ArrayList<VariantAnalysis>> analysisSets;
 
+    long nSites = 0;
+
     final String ALL_SNPS = "all";
     final String SINGLETON_SNPS = "singletons";
     final String TWOHIT_SNPS = "2plus_hit";
-    final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS };
+    final String KNOWN_SNPS = "known";
+    final String NOVEL_SNPS = "novel";
+    final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS };
 
     public void initialize() {
         // setup the path to the analysis
@@ -57,13 +63,13 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
         // Add new analyzes here!
         //
         analyses.add(new VariantCounter());
-        analyses.add(new VariantDBCoverage("dbSNP"));
+        analyses.add(new VariantDBCoverage(knownSNPDBName));
         analyses.add(new TransitionTranversionAnalysis());
         analyses.add(new NeighborDistanceAnalysis());
         analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
         analyses.add(new ClusterCounterAnalysis());
 
-        if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));
+        if ( printVariants ) analyses.add(new VariantMatcher(knownSNPDBName));
 
         for ( VariantAnalysis analysis : analyses ) {
             analysis.initialize(this, openAnalysisOutputStream(setName, analysis));
@@ -100,6 +106,8 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
     }
 
     public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
+        nSites++;
+
         // Iterate over each analysis, and update it
         AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
 
@@ -107,8 +115,24 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
         if ( eval != null && eval.getVariationConfidence() < minDiscoveryQ )
             eval = null;
 
+        // update stats about all of the SNPs
         updateAnalysisSet(ALL_SNPS, eval, tracker, ref, context);
 
+        // update the known / novel set by checking whether the knownSNPDBName track has an entry here
+        if ( eval != null ) {
+            AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(knownSNPDBName, null);
+            String noveltySet = dbsnp == null ? NOVEL_SNPS : KNOWN_SNPS;
+            updateAnalysisSet(noveltySet, eval, tracker, ref, context);
+        }
+
+        if ( eval instanceof SNPCallFromGenotypes ) {
+            SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
+            int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
+            //System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls);
+            final String s = nVarGenotypes == 1 ? SINGLETON_SNPS : TWOHIT_SNPS;
+            updateAnalysisSet(s, eval, tracker, ref, context);
+        }
+
         if ( eval instanceof SNPCallFromGenotypes ) {
             SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
             int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
@@ -146,8 +170,10 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
     }
 
     private void printAnalysisSet( final String analysisSetName ) {
+        out.printf("Writing analysis set %s", analysisSetName);
         Date now = new Date();
         for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
+            analysis.finalize(nSites);
             PrintStream stream = analysis.getPrintStream(); // getAnalysisOutputStream(analysisSetName + "." + analysis.getName(), analysis.getParams());
             stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78));
             stream.printf("%sAnalysis set       %s%n", COMMENT_STRING, analysisSetName);