From 587d07da00c41996e41209e9e05320ae5b76ee47 Mon Sep 17 00:00:00 2001
From: andrewk <andrewk@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Tue, 2 Jun 2009 16:55:05 +0000
Subject: [PATCH] Merged functionality of two python scripts into
 LogRegression.py, some clarity updates to covariate and regression java
 files.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@876 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../gatk/walkers/CovariateCounterWalker.java  | 67 +++++++++----------
 .../walkers/LogisticRecalibrationWalker.java  |  2 +-
 python/LogRegression.py                       | 30 +++++++--
 3 files changed, 57 insertions(+), 42 deletions(-)
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java
index bbf6d1f6f..7883fe5a1 100644
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java
@@ -49,12 +49,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
     long counted_sites = 0; // number of sites used to count covariates
     long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
 
-    String dinuc_root = "dinuc";
-    ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
-    PrintStream ByQualFile; // = new PrintStream("quality_empirical_vs_observed.csv");
-    PrintStream ByCycleFile;
-    PrintStream ByDinucFile;
-
     private class RecalData {
         long N;
         long B;
@@ -104,15 +98,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
             }
         }
 
-        try {
-            ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
-            ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
-            ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
-        } catch (FileNotFoundException e){
-            System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
-            System.exit(1);
-        }
-
+        
     }
 
     public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
@@ -123,7 +109,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
             for (int i =0; i < reads.size(); i++ ) {
                 SAMRecord read = reads.get(i);
 
-                if ((READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
+                if (//read.getHeader().getReadGroup((String)read.getAttribute("RG")).getAttribute("PL") == "ILLUMINA" &&
+                    !read.getReadNegativeStrandFlag() &&
+                    (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
                     (read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
                     (DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) {
                     //(random_genrator.nextFloat() <= DOWNSAMPLE_FRACTION)
@@ -134,14 +122,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
                         if (qual > 0 && qual <= MAX_QUAL_SCORE) {
                             // previous base is the next base in terms of machine chemistry if this is a negative strand
                             char base = (char)read.getReadBases()[offset];
-                            char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
-                            int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag());
+                            char prevBase = (char)read.getReadBases()[offset -1];
+                            int dinuc_index = bases2dinucIndex(prevBase, base, false);
                             //char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
                             //int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag());
 
                             // Convert offset into cycle position which means reversing the position of reads on the negative strand
-                            int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
-                            data[cycle][qual][dinuc_index].inc(base,ref);
+                            //int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
+                            //data[cycle][qual][dinuc_index].inc(base,ref);
+                            data[offset][qual][dinuc_index].inc(base,ref);
                         }
                     }
                 }
@@ -149,6 +138,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
             counted_sites += 1;
         }else{
             skipped_sites += 1;
+            //System.out.println(dbsnp.toSimpleString()+" "+new ReadBackedPileup(ref, context).getPileupString());
         }
         return 1;
     }
@@ -174,19 +164,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
         out.printf("Skipped sites: %d%n", skipped_sites);
         out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites);
 
-        // Close dinuc filehandles
         if (CREATE_TRAINING_DATA) writeTrainingData();
-        /*if (CREATE_TRAINING_DATA)
-            for ( PrintStream dinuc_stream : this.dinuc_outs)
-                dinuc_stream.close();*/
-
     }
 
     void writeTrainingData() {
         for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
             PrintStream dinuc_out = null;
             try {
-                dinuc_out = new PrintStream( dinuc_root+"."+dinucIndex2bases(dinuc_index)+".csv");
+                dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts."+dinucIndex2bases(dinuc_index)+".csv");
                 dinuc_out.println("logitQ,pos,indicator,count");
 
                 for ( RecalData datum: flattenData ) {
@@ -225,6 +210,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
     }
 
     public void qualityDiffVsCycle() {
+        PrintStream ByCycleFile = null;
+        try {
+            ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
+        } catch (FileNotFoundException e){
+            System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
+            System.exit(1);
+        }
         ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
         ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
         ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
@@ -252,6 +244,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
     }
 
     public void qualityDiffVsDinucleotide() {
+        PrintStream ByDinucFile = null;
+        try {
+            ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
+        } catch (FileNotFoundException e){
+            System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
+            System.exit(1);
+        }
         ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
         ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
         ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
@@ -280,7 +279,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
     }
 
     public void qualityEmpiricalObserved() {
-
+        PrintStream ByQualFile  = null;
+        try {
+            ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
+        } catch (FileNotFoundException e){
+            System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
+            System.exit(1);
+        }
         ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
         ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
         ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n");
@@ -353,12 +358,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
     Random random_genrator;
     // Print out data for regression
     public CovariateCounterWalker()  throws FileNotFoundException {
-        /*if (CREATE_TRAINING_DATA)
-            for ( int i=0; i<NDINUCS; i++) {
-                PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
-                next_dinuc.println("logitQ,pos,indicator");
-                dinuc_outs.add(next_dinuc);
-            } */
         random_genrator = new Random(123454321); // keep same random seed while debugging
     }
-}
\ No newline at end of file
+}
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java
index 0b583a9cf..0c7344366 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java
@@ -62,7 +62,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
         ArrayList<Pair<Integer, Integer>> mapping = new ArrayList<Pair<Integer, Integer>>();
 
         String[] elts = headerLine.split("\\s+");
-        if ( ! elts[0].toLowerCase().equals("dinuc") )
+        if ( ! elts[0].toLowerCase().startsWith("dinuc") ) // checking only start of first field because dinuc will be followed by a version number to be checekde later
             Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine);
 
         for ( int k = 1; k < elts.length; k++ ) {
diff --git a/python/LogRegression.py b/python/LogRegression.py
index 745c44c9f..49b94b7ef 100755
--- a/python/LogRegression.py
+++ b/python/LogRegression.py
@@ -1,19 +1,35 @@
-
 #!/usr/bin/env python
 
-import sys
+import subprocess, time, sys
 
-dinuc_root = sys.argv[1]
-fout = file(dinuc_root+".log_reg_params", "w")
+dinucs = ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT")
 
-fout.write("dinuc\t")
+#covar_work_dir = "/humgen/gsa-scr1/andrewk/recalibration_work"
+#covar_table_dir = "/humgen/gsa-scr1/andrewk/recalibration_tables"
+fileroot = sys.argv[1]
+covar_counts_root = fileroot+".covariate_counts"
+parameter_root = fileroot+".log_reg"
+recal_table = fileroot+".recalibration_table"
+
+# Run all R process to do regression and wait for completion
+kids = []
+for dinuc in dinucs:
+    kids.append(subprocess.Popen(["/home/radon01/andrewk/covariates/logistic_regression.R", covar_counts_root, parameter_root, dinuc]))
+    #kids.append(subprocess.Popen(["sleep", "8"]))
+
+while any(kid.poll() is None for kid in kids):
+    time.sleep(0.25)
+
+fout = file(recal_table, "w")
+
+fout.write("dinuc_v.1\t")
 for p in range(5):
     for q in range(5):
         fout.write("%d,%d\t" % (p,q))
 fout.write("\n")
 
-for dinuc in ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT"):
-    dinin = open(dinuc_root+"."+dinuc+".parameters")
+for dinuc in dinucs:
+    dinin = open(parameter_root+"."+dinuc+".parameters")
     #dinin.readline()
     params = []
     for line in dinin: