Added novel / known split to variant eval. That is, emits all of the standard analyses on SNP partitioned into those known in the provided known db and those novel. Also fixed problem with counting bases within subsets

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1068 348d0f76-0448-11de-a6fe-93d51630548a
2009-06-21 21:27:40 +00:00 · 2009-06-21 21:27:40 +00:00 · 26eb362f52
parent d3f0c51944
commit 26eb362f52
6 changed files with 62 additions and 155 deletions
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
@ -266,7 +266,7 @@ public abstract class TraversalEngine {
        final double elapsed = (curTime - startTime) / 1000.0;
        //System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));

-        if (mustPrint || nRecords % N_RECORDS_TO_PRINT == 0 || maxElapsedIntervalForPrinting(curTime)) {
+        if (mustPrint || nRecords == 1 || nRecords % N_RECORDS_TO_PRINT == 0 || maxElapsedIntervalForPrinting(curTime)) {
            this.lastProgressPrintTime = curTime;
            final double secsPer1MReads = (elapsed * 1000000.0) / nRecords;
            if (loc != null)
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java
@ -31,16 +31,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
    @Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)")
    public String READ_GROUP = "none";

-    @Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
-    public int MAX_READ_GROUPS = 100;
+    //@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
+    //public int MAX_READ_GROUPS = 100;

    @Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
    public List<String> platforms = Collections.singletonList("*");
    //public List<String> platforms = Collections.singletonList("ILLUMINA");

-    @Argument(fullName="writeDetailedAnalysisFiles", required=false, doc="If true, we'll write detailed analysis files out at the end of covariate calcuilations")
-    public boolean writeDetailedAnalysisFiles = false;
-
    @Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
    public boolean collapsePos = false;

@ -54,9 +51,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
    long skipped_sites = 0; // number of sites skipped because of a dbSNP entry

    public void initialize() {
-        if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS )
-            Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." +
-                            "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter");
+        //if( getToolkit().getEngine().getSAMHeader().getReadGroups().size() > MAX_READ_GROUPS )
+        //    Utils.scareUser("Number of read groups in the specified file exceeds the number that can be processed in a reasonable amount of memory." +
+        //                    "To override this limit, use the --MAX_READ_GROUPS (-mrg) parameter");

        for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
            if( readGroup.getAttribute("PL") == null )
@ -96,7 +93,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {

                SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
                if ( isSupportedReadGroup(readGroup) &&
-                    //!read.getReadNegativeStrandFlag() &&
                    (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
                    (read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
                    int offset = offsets.get(i);
@ -147,8 +143,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {

    public void onTraversalDone(Integer result) {
        printInfo(out);
-        writeTrainingData();
-        writeAnalysisData();
+
+        out.printf("Writing raw recalibration data%n");
+        writeRecalTable();
+        out.printf("...done%n");
+
+        //out.printf("Writing logistic recalibration data%n");
+        //writeLogisticRecalibrationTable();
+        //out.printf("...done%n");
    }

    private void printInfo(PrintStream out) {
@ -161,17 +163,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
        out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
    }

-
-    private void writeTrainingData() {
-        out.printf("Writing raw recalibration data%n");
-        writeRecalTable();
-        out.printf("...done%n");
-
-        out.printf("Writing logistic recalibration data%n");
-        writeLogisticRecalibrationTable();
-        out.printf("...done%n");        
-    }
-
    private void writeLogisticRecalibrationTable() {
        PrintStream dinuc_out = null;
        try {
@ -218,132 +209,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
         }
     }

-
-    // --------------------------------------------------------------------------------------------------------------
-    //
-    // Analysis data
-    //
-    // --------------------------------------------------------------------------------------------------------------
-    private void writeAnalysisData() {
-        if ( writeDetailedAnalysisFiles ) {
-            for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
-                PrintStream DiffVsCycleFile = null, DiffVsDinucFile = null, EmpObsFile = null;
-                try {
-                    DiffVsCycleFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_cycle.csv");
-                    DiffVsDinucFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_dinucleotide.csv");
-                    EmpObsFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".empirical_v_reported_quality.csv");
-
-                    qualityEmpiricalObserved(EmpObsFile, readGroup.getReadGroupId());
-                    qualityDiffVsCycle(DiffVsCycleFile, readGroup.getReadGroupId());
-                    qualityDiffVsDinucleotide(DiffVsDinucFile, readGroup.getReadGroupId());
-                } catch (FileNotFoundException e){
-                    throw new RuntimeException("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT, e);
-                } finally {
-                    if ( DiffVsCycleFile != null) DiffVsCycleFile.close();
-                    if ( DiffVsDinucFile != null) DiffVsDinucFile.close();
-                    if ( EmpObsFile != null) EmpObsFile.close();
-                }
-            }
-        }
-    }
-
-    class MeanReportedQuality {
-        double Qn = 0;
-        long n = 0;
-        double sumErrors = 0; // Count of estimated number of errors
-
-        void inc(double Q, long n) {
-            this.n += n;
-            //Qn += Q * n; // wrong calculation that worked did not account for Qs being in log space
-            sumErrors += QualityUtils.qualToErrorProb((byte)Q) * n;
-        }
-
-        double result() {
-            //return Qn / n;
-            return -10 * Math.log10(sumErrors / n);
-            //return QualityUtils.probToQual(1.0 - (sumErrors / n));
-        }
-    }
-
-    public void qualityDiffVsCycle(PrintStream file, final String readGroup) {
-        ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
-        ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
-        file.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
-        RecalData All = new RecalData(0,0,readGroup,"");
-        MeanReportedQuality AllReported = new MeanReportedQuality();
-        RecalDataManager manager = data.get(readGroup);
-        int maxReadLen = manager.getMaxReadLen();
-        for (int c=0; c < maxReadLen+1; c++)  {
-            ByCycle.add(new RecalData(c, -1,readGroup,"-"));
-            ByCycleReportedQ.add(new MeanReportedQuality());
-        }
-
-        for ( RecalData datum: getRecalData(readGroup) ) {
-            ByCycle.get(datum.pos).inc(datum.N, datum.B);
-            ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
-            All.inc(datum.N, datum.B);
-            AllReported.inc(datum.qual, datum.N);
-        }
-
-        for (int c=0; c < maxReadLen+1; c++) {
-            double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
-            double reportedQual = ByCycleReportedQ.get(c).result();
-            file.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
-        }
-    }
-
-    public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) {
-        ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
-        ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
-        file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
-        RecalData All = new RecalData(0,0,readGroup,"");
-        MeanReportedQuality AllReported = new MeanReportedQuality();
-        for (int c=0; c < RecalData.NDINUCS; c++) {
-            ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c)));
-            ByCycleReportedQ.add(new MeanReportedQuality());
-        }
-
-        for ( RecalData datum: getRecalData(readGroup) ) {
-            int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
-            ByCycle.get(dinucIndex).inc(datum.N, datum.B);
-            ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
-            All.inc(datum.N, datum.B);
-            AllReported.inc(datum.qual, datum.N);
-        }
-
-        for (int c=0; c < RecalData.NDINUCS; c++) {
-            double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
-            double reportedQual = ByCycleReportedQ.get(c).result();
-            file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
-        }
-    }
-
-    public void qualityEmpiricalObserved(PrintStream file, final String readGroup) {
-        ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
-        ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
-        file.printf("Qrep,Qemp,Qrep_avg,B,N%n");
-        RecalData All = new RecalData(0,0,readGroup,"");
-        MeanReportedQuality AllReported = new MeanReportedQuality();
-        for (int q=0; q<QualityUtils.MAX_QUAL_SCORE+1; q++) {
-            ByQ.add(new RecalData(-1,q,readGroup,"-"));
-            ByQReportedQ.add(new MeanReportedQuality());
-        }
-
-        for ( RecalData datum: getRecalData(readGroup) ){
-            ByQ.get(datum.qual).inc(datum.N, datum.B);
-            ByQReportedQ.get(datum.qual).inc(datum.qual, datum.N);
-            All.inc(datum.N, datum.B);
-            AllReported.inc(datum.qual, datum.N);
-            //out.printf("%2d%6d%3d %2d %s%n", datum.qual, datum.N, datum.pos, datum.qual, datum.dinuc);
-        }
-
-        for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) {
-            double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
-            file.printf("%3d, %2.2f, %2.2f, %12d, %12d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
-            //out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);                                                                                      n
-        }
-    }
-
    public Integer reduceInit() {
        return 0;
    }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java
@ -37,6 +37,14 @@ public abstract class BasicVariantAnalysis implements VariantAnalysis {
    public List<String> done() {
        return new ArrayList<String>();
    }
-    
+
+    /**
+     * No need to finalize the data in general
+     * @param nSites
+     */
+    public void finalize(long nSites) {
+
+    }
+
    public abstract String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java
@ -13,5 +13,6 @@ public interface VariantAnalysis {
    public List<String> getParams();
    public void initialize(VariantEvalWalker master, PrintStream out);
    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
+    public void finalize(long nSites);
    public List<String> done();
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java
@ -9,7 +9,7 @@ import java.util.List;
 import java.util.ArrayList;

 public class VariantCounter extends BasicVariantAnalysis {
-    int nBasesCovered = 0;
+    long nBasesCovered = 0;
    int nSNPs = 0;

    public VariantCounter() {
@ -17,11 +17,18 @@ public class VariantCounter extends BasicVariantAnalysis {
    }

    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
-        nBasesCovered++;
        nSNPs += eval == null ? 0 : 1;
        return null;
    }

+    /**
+     * No need to finalize the data in general
+     * @param nSites
+     */
+    public void finalize(long nSites) {
+        nBasesCovered = nSites;
+    }
+
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("n bases covered: %d", nBasesCovered));
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java
@ -27,12 +27,18 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {

    String COMMENT_STRING = "";

+    final String knownSNPDBName = "dbSNP";
+
    HashMap<String, ArrayList<VariantAnalysis>> analysisSets;

+    long nSites = 0;
+
    final String ALL_SNPS = "all";
    final String SINGLETON_SNPS = "singletons";
    final String TWOHIT_SNPS = "2plus_hit";
-    final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS };
+    final String KNOWN_SNPS = "known";
+    final String NOVEL_SNPS = "novel";
+    final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS };

    public void initialize() {
        // setup the path to the analysis
@ -57,13 +63,13 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
        // Add new analyzes here!
        //
        analyses.add(new VariantCounter());
-        analyses.add(new VariantDBCoverage("dbSNP"));
+        analyses.add(new VariantDBCoverage(knownSNPDBName));
        analyses.add(new TransitionTranversionAnalysis());
        analyses.add(new NeighborDistanceAnalysis());
        analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
        analyses.add(new ClusterCounterAnalysis());

-        if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));
+        if ( printVariants ) analyses.add(new VariantMatcher(knownSNPDBName));

        for ( VariantAnalysis analysis : analyses ) {
            analysis.initialize(this, openAnalysisOutputStream(setName, analysis));
@ -100,6 +106,8 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
    }

    public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
+        nSites++;
+
        // Iterate over each analysis, and update it
        AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);

@ -107,8 +115,24 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
        if ( eval != null && eval.getVariationConfidence() < minDiscoveryQ )
            eval = null;

+        // update stats about all of the SNPs
        updateAnalysisSet(ALL_SNPS, eval, tracker, ref, context);

+        // update the known / novel set by checking whether the knownSNPDBName track has an entry here
+        if ( eval != null ) {
+            AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(knownSNPDBName, null);
+            String noveltySet = dbsnp == null ? NOVEL_SNPS : KNOWN_SNPS;
+            updateAnalysisSet(noveltySet, eval, tracker, ref, context);
+        }
+
+        if ( eval instanceof SNPCallFromGenotypes ) {
+            SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
+            int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
+            //System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls);
+            final String s = nVarGenotypes == 1 ? SINGLETON_SNPS : TWOHIT_SNPS;
+            updateAnalysisSet(s, eval, tracker, ref, context);
+        }
+
        if ( eval instanceof SNPCallFromGenotypes ) {
            SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
            int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
@ -146,8 +170,10 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
    }

    private void printAnalysisSet( final String analysisSetName ) {
+        out.printf("Writing analysis set %s", analysisSetName);
        Date now = new Date();
        for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
+            analysis.finalize(nSites);
            PrintStream stream = analysis.getPrintStream(); // getAnalysisOutputStream(analysisSetName + "." + analysis.getName(), analysis.getParams());
            stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78));
            stream.printf("%sAnalysis set       %s%n", COMMENT_STRING, analysisSetName);