From 587d07da00c41996e41209e9e05320ae5b76ee47 Mon Sep 17 00:00:00 2001 From: andrewk Date: Tue, 2 Jun 2009 16:55:05 +0000 Subject: [PATCH] Merged functionality of two python scripts into LogRegression.py, some clarity updates to covariate and regression java files. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@876 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/CovariateCounterWalker.java | 67 +++++++++---------- .../walkers/LogisticRecalibrationWalker.java | 2 +- python/LogRegression.py | 30 +++++++-- 3 files changed, 57 insertions(+), 42 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java index bbf6d1f6f..7883fe5a1 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java @@ -49,12 +49,6 @@ public class CovariateCounterWalker extends LocusWalker { long counted_sites = 0; // number of sites used to count covariates long skipped_sites = 0; // number of sites skipped because of a dbSNP entry - String dinuc_root = "dinuc"; - ArrayList dinuc_outs = new ArrayList(); - PrintStream ByQualFile; // = new PrintStream("quality_empirical_vs_observed.csv"); - PrintStream ByCycleFile; - PrintStream ByDinucFile; - private class RecalData { long N; long B; @@ -104,15 +98,7 @@ public class CovariateCounterWalker extends LocusWalker { } } - try { - ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv"); - ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv"); - ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv"); - } catch (FileNotFoundException e){ - System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); - System.exit(1); - } - + } public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { @@ -123,7 +109,9 @@ public class CovariateCounterWalker extends LocusWalker { for (int i =0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); - if ((READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) && + if (//read.getHeader().getReadGroup((String)read.getAttribute("RG")).getAttribute("PL") == "ILLUMINA" && + !read.getReadNegativeStrandFlag() && + (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) && (read.getMappingQuality() >= MIN_MAPPING_QUALITY) && (DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) { //(random_genrator.nextFloat() <= DOWNSAMPLE_FRACTION) @@ -134,14 +122,15 @@ public class CovariateCounterWalker extends LocusWalker { if (qual > 0 && qual <= MAX_QUAL_SCORE) { // previous base is the next base in terms of machine chemistry if this is a negative strand char base = (char)read.getReadBases()[offset]; - char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)]; - int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag()); + char prevBase = (char)read.getReadBases()[offset -1]; + int dinuc_index = bases2dinucIndex(prevBase, base, false); //char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)]; //int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag()); // Convert offset into cycle position which means reversing the position of reads on the negative strand - int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset; - data[cycle][qual][dinuc_index].inc(base,ref); + //int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset; + //data[cycle][qual][dinuc_index].inc(base,ref); + data[offset][qual][dinuc_index].inc(base,ref); } } } @@ -149,6 +138,7 @@ public class CovariateCounterWalker extends LocusWalker { counted_sites += 1; }else{ skipped_sites += 1; + //System.out.println(dbsnp.toSimpleString()+" "+new ReadBackedPileup(ref, context).getPileupString()); } return 1; } @@ -174,19 +164,14 @@ public class CovariateCounterWalker extends LocusWalker { out.printf("Skipped sites: %d%n", skipped_sites); out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites); - // Close dinuc filehandles if (CREATE_TRAINING_DATA) writeTrainingData(); - /*if (CREATE_TRAINING_DATA) - for ( PrintStream dinuc_stream : this.dinuc_outs) - dinuc_stream.close();*/ - } void writeTrainingData() { for ( int dinuc_index=0; dinuc_index { } public void qualityDiffVsCycle() { + PrintStream ByCycleFile = null; + try { + ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv"); + } catch (FileNotFoundException e){ + System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); + System.exit(1); + } ArrayList ByCycle = new ArrayList(); ArrayList ByCycleReportedQ = new ArrayList(); ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n"); @@ -252,6 +244,13 @@ public class CovariateCounterWalker extends LocusWalker { } public void qualityDiffVsDinucleotide() { + PrintStream ByDinucFile = null; + try { + ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv"); + } catch (FileNotFoundException e){ + System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); + System.exit(1); + } ArrayList ByCycle = new ArrayList(); ArrayList ByCycleReportedQ = new ArrayList(); ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); @@ -280,7 +279,13 @@ public class CovariateCounterWalker extends LocusWalker { } public void qualityEmpiricalObserved() { - + PrintStream ByQualFile = null; + try { + ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv"); + } catch (FileNotFoundException e){ + System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); + System.exit(1); + } ArrayList ByQ = new ArrayList(); ArrayList ByQReportedQ = new ArrayList(); ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n"); @@ -353,12 +358,6 @@ public class CovariateCounterWalker extends LocusWalker { Random random_genrator; // Print out data for regression public CovariateCounterWalker() throws FileNotFoundException { - /*if (CREATE_TRAINING_DATA) - for ( int i=0; i> mapping = new ArrayList>(); String[] elts = headerLine.split("\\s+"); - if ( ! elts[0].toLowerCase().equals("dinuc") ) + if ( ! elts[0].toLowerCase().startsWith("dinuc") ) // checking only start of first field because dinuc will be followed by a version number to be checekde later Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine); for ( int k = 1; k < elts.length; k++ ) { diff --git a/python/LogRegression.py b/python/LogRegression.py index 745c44c9f..49b94b7ef 100755 --- a/python/LogRegression.py +++ b/python/LogRegression.py @@ -1,19 +1,35 @@ - #!/usr/bin/env python -import sys +import subprocess, time, sys -dinuc_root = sys.argv[1] -fout = file(dinuc_root+".log_reg_params", "w") +dinucs = ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT") -fout.write("dinuc\t") +#covar_work_dir = "/humgen/gsa-scr1/andrewk/recalibration_work" +#covar_table_dir = "/humgen/gsa-scr1/andrewk/recalibration_tables" +fileroot = sys.argv[1] +covar_counts_root = fileroot+".covariate_counts" +parameter_root = fileroot+".log_reg" +recal_table = fileroot+".recalibration_table" + +# Run all R process to do regression and wait for completion +kids = [] +for dinuc in dinucs: + kids.append(subprocess.Popen(["/home/radon01/andrewk/covariates/logistic_regression.R", covar_counts_root, parameter_root, dinuc])) + #kids.append(subprocess.Popen(["sleep", "8"])) + +while any(kid.poll() is None for kid in kids): + time.sleep(0.25) + +fout = file(recal_table, "w") + +fout.write("dinuc_v.1\t") for p in range(5): for q in range(5): fout.write("%d,%d\t" % (p,q)) fout.write("\n") -for dinuc in ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT"): - dinin = open(dinuc_root+"."+dinuc+".parameters") +for dinuc in dinucs: + dinin = open(parameter_root+"."+dinuc+".parameters") #dinin.readline() params = [] for line in dinin: