Covariate counter now outputs files used by R to do logistic regression.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@527 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-04-24 17:11:57 +00:00
parent 4e4fd33584
commit 061f4328b1
1 changed files with 34 additions and 7 deletions

View File

@ -10,6 +10,8 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.io.PrintStream;
import java.io.FileNotFoundException;
@WalkerName("CountCovariates") @WalkerName("CountCovariates")
public class CovariateCounterWalker extends LocusWalker<Integer, Integer> { public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@ -18,7 +20,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="34") @Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="34")
public int MAX_QUAL_SCORE; public int MAX_QUAL_SCORE;
//public double MAX_REPORTED_QUAL_SCORE = 100.0;
@Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files")
public String LOG_REG_TRAINING_FILEROOT;
int NDINUCS = 16; int NDINUCS = 16;
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]; RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
@ -28,6 +32,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
int nuc2num[]; int nuc2num[];
char num2nuc[]; char num2nuc[];
String dinuc_root = "dinuc";
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
PrintStream covars_out = new PrintStream("covars.out");
private class RecalData { private class RecalData {
long N; long N;
long B; long B;
@ -86,8 +94,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
char prevBase = (char)read.getReadBases()[offset-1]; char prevBase = (char)read.getReadBases()[offset-1];
int qual = (int)read.getBaseQualities()[offset]; int qual = (int)read.getBaseQualities()[offset];
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base)); //out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
RecalData datum = data[offset][qual][bases2dinucIndex(prevBase,base)]; int dinuc_index = bases2dinucIndex(prevBase,base);
RecalData datum = data[offset][qual][dinuc_index];
datum.inc(base,ref); datum.inc(base,ref);
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, base==ref ? 0 : 1);
} }
} }
} }
@ -96,10 +106,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
public void onTraversalDone(Integer result) { public void onTraversalDone(Integer result) {
if (flattenData.size() > 0) if (flattenData.size() > 0)
out.println(flattenData.get(0).headerString()); covars_out.println(flattenData.get(0).headerString());
for ( RecalData datum : flattenData ) { for ( RecalData datum : flattenData ) {
out.println(datum); covars_out.println(datum);
} }
// Close dinuc filehandles
for ( PrintStream dinuc_stream : this.dinuc_outs)
dinuc_stream.close();
} }
public Integer reduceInit() { public Integer reduceInit() {
@ -112,14 +127,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
int bases2dinucIndex(char prevBase, char base) { int bases2dinucIndex(char prevBase, char base) {
return nuc2num[prevBase] * 4 + nuc2num[base]; return nuc2num[prevBase] * 4 + nuc2num[base];
}; }
String dinucIndex2bases(int index) { String dinucIndex2bases(int index) {
char data[] = {num2nuc[index / 4], num2nuc[index % 4]}; char data[] = {num2nuc[index / 4], num2nuc[index % 4]};
return new String( data ); return new String( data );
} }
public CovariateCounterWalker() { public CovariateCounterWalker() throws FileNotFoundException {
nuc2num = new int[128]; nuc2num = new int[128];
nuc2num['A'] = 0; nuc2num['A'] = 0;
nuc2num['C'] = 1; nuc2num['C'] = 1;
@ -135,5 +150,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
num2nuc[1] = 'C'; num2nuc[1] = 'C';
num2nuc[2] = 'G'; num2nuc[2] = 'G';
num2nuc[3] = 'T'; num2nuc[3] = 'T';
// Print out data for regression
for ( int i=0; i<NDINUCS; i++) {
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
next_dinuc.println("logitQ,pos,indicator");
dinuc_outs.add(next_dinuc);
}
} }
} }