Covariate counter now outputs files used by R to do logistic regression.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@527 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4e4fd33584
commit
061f4328b1
|
|
@ -10,6 +10,8 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
@WalkerName("CountCovariates")
|
@WalkerName("CountCovariates")
|
||||||
public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
@ -18,7 +20,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="34")
|
@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", required=false,defaultValue="34")
|
||||||
public int MAX_QUAL_SCORE;
|
public int MAX_QUAL_SCORE;
|
||||||
//public double MAX_REPORTED_QUAL_SCORE = 100.0;
|
|
||||||
|
@Argument(fullName="LOG_REG_TRAINING_FILEROOT", shortName="lrtf", required=false, defaultValue="dinuc", doc="Filename root for the outputted logistic regression training files")
|
||||||
|
public String LOG_REG_TRAINING_FILEROOT;
|
||||||
|
|
||||||
int NDINUCS = 16;
|
int NDINUCS = 16;
|
||||||
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
||||||
|
|
@ -28,6 +32,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
int nuc2num[];
|
int nuc2num[];
|
||||||
char num2nuc[];
|
char num2nuc[];
|
||||||
|
|
||||||
|
String dinuc_root = "dinuc";
|
||||||
|
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
||||||
|
PrintStream covars_out = new PrintStream("covars.out");
|
||||||
|
|
||||||
private class RecalData {
|
private class RecalData {
|
||||||
long N;
|
long N;
|
||||||
long B;
|
long B;
|
||||||
|
|
@ -86,8 +94,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
char prevBase = (char)read.getReadBases()[offset-1];
|
char prevBase = (char)read.getReadBases()[offset-1];
|
||||||
int qual = (int)read.getBaseQualities()[offset];
|
int qual = (int)read.getBaseQualities()[offset];
|
||||||
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
||||||
RecalData datum = data[offset][qual][bases2dinucIndex(prevBase,base)];
|
int dinuc_index = bases2dinucIndex(prevBase,base);
|
||||||
|
RecalData datum = data[offset][qual][dinuc_index];
|
||||||
datum.inc(base,ref);
|
datum.inc(base,ref);
|
||||||
|
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, offset, base==ref ? 0 : 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -96,10 +106,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
public void onTraversalDone(Integer result) {
|
public void onTraversalDone(Integer result) {
|
||||||
if (flattenData.size() > 0)
|
if (flattenData.size() > 0)
|
||||||
out.println(flattenData.get(0).headerString());
|
covars_out.println(flattenData.get(0).headerString());
|
||||||
for ( RecalData datum : flattenData ) {
|
for ( RecalData datum : flattenData ) {
|
||||||
out.println(datum);
|
covars_out.println(datum);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Close dinuc filehandles
|
||||||
|
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
||||||
|
dinuc_stream.close();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
|
|
@ -112,14 +127,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
int bases2dinucIndex(char prevBase, char base) {
|
int bases2dinucIndex(char prevBase, char base) {
|
||||||
return nuc2num[prevBase] * 4 + nuc2num[base];
|
return nuc2num[prevBase] * 4 + nuc2num[base];
|
||||||
};
|
}
|
||||||
|
|
||||||
String dinucIndex2bases(int index) {
|
String dinucIndex2bases(int index) {
|
||||||
char data[] = {num2nuc[index / 4], num2nuc[index % 4]};
|
char data[] = {num2nuc[index / 4], num2nuc[index % 4]};
|
||||||
return new String( data );
|
return new String( data );
|
||||||
}
|
}
|
||||||
|
|
||||||
public CovariateCounterWalker() {
|
public CovariateCounterWalker() throws FileNotFoundException {
|
||||||
nuc2num = new int[128];
|
nuc2num = new int[128];
|
||||||
nuc2num['A'] = 0;
|
nuc2num['A'] = 0;
|
||||||
nuc2num['C'] = 1;
|
nuc2num['C'] = 1;
|
||||||
|
|
@ -135,5 +150,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
num2nuc[1] = 'C';
|
num2nuc[1] = 'C';
|
||||||
num2nuc[2] = 'G';
|
num2nuc[2] = 'G';
|
||||||
num2nuc[3] = 'T';
|
num2nuc[3] = 'T';
|
||||||
|
|
||||||
|
// Print out data for regression
|
||||||
|
for ( int i=0; i<NDINUCS; i++) {
|
||||||
|
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
||||||
|
next_dinuc.println("logitQ,pos,indicator");
|
||||||
|
dinuc_outs.add(next_dinuc);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue