Merged functionality of two python scripts into LogRegression.py, some clarity updates to covariate and regression java files.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@876 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
82aa0533b8
commit
587d07da00
|
|
@ -49,12 +49,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
long counted_sites = 0; // number of sites used to count covariates
|
long counted_sites = 0; // number of sites used to count covariates
|
||||||
long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
||||||
|
|
||||||
String dinuc_root = "dinuc";
|
|
||||||
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
|
||||||
PrintStream ByQualFile; // = new PrintStream("quality_empirical_vs_observed.csv");
|
|
||||||
PrintStream ByCycleFile;
|
|
||||||
PrintStream ByDinucFile;
|
|
||||||
|
|
||||||
private class RecalData {
|
private class RecalData {
|
||||||
long N;
|
long N;
|
||||||
long B;
|
long B;
|
||||||
|
|
@ -104,15 +98,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
|
||||||
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
|
|
||||||
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
|
|
||||||
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
|
|
||||||
} catch (FileNotFoundException e){
|
|
||||||
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
|
|
||||||
System.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
|
@ -123,7 +109,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
for (int i =0; i < reads.size(); i++ ) {
|
for (int i =0; i < reads.size(); i++ ) {
|
||||||
SAMRecord read = reads.get(i);
|
SAMRecord read = reads.get(i);
|
||||||
|
|
||||||
if ((READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
if (//read.getHeader().getReadGroup((String)read.getAttribute("RG")).getAttribute("PL") == "ILLUMINA" &&
|
||||||
|
!read.getReadNegativeStrandFlag() &&
|
||||||
|
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
||||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
|
(read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
|
||||||
(DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) {
|
(DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) {
|
||||||
//(random_genrator.nextFloat() <= DOWNSAMPLE_FRACTION)
|
//(random_genrator.nextFloat() <= DOWNSAMPLE_FRACTION)
|
||||||
|
|
@ -134,14 +122,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
||||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||||
char base = (char)read.getReadBases()[offset];
|
char base = (char)read.getReadBases()[offset];
|
||||||
char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
char prevBase = (char)read.getReadBases()[offset -1];
|
||||||
int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag());
|
int dinuc_index = bases2dinucIndex(prevBase, base, false);
|
||||||
//char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
//char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
||||||
//int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag());
|
//int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag());
|
||||||
|
|
||||||
// Convert offset into cycle position which means reversing the position of reads on the negative strand
|
// Convert offset into cycle position which means reversing the position of reads on the negative strand
|
||||||
int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
|
//int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
|
||||||
data[cycle][qual][dinuc_index].inc(base,ref);
|
//data[cycle][qual][dinuc_index].inc(base,ref);
|
||||||
|
data[offset][qual][dinuc_index].inc(base,ref);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -149,6 +138,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
counted_sites += 1;
|
counted_sites += 1;
|
||||||
}else{
|
}else{
|
||||||
skipped_sites += 1;
|
skipped_sites += 1;
|
||||||
|
//System.out.println(dbsnp.toSimpleString()+" "+new ReadBackedPileup(ref, context).getPileupString());
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
@ -174,19 +164,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
out.printf("Skipped sites: %d%n", skipped_sites);
|
out.printf("Skipped sites: %d%n", skipped_sites);
|
||||||
out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites);
|
out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites);
|
||||||
|
|
||||||
// Close dinuc filehandles
|
|
||||||
if (CREATE_TRAINING_DATA) writeTrainingData();
|
if (CREATE_TRAINING_DATA) writeTrainingData();
|
||||||
/*if (CREATE_TRAINING_DATA)
|
|
||||||
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
|
||||||
dinuc_stream.close();*/
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void writeTrainingData() {
|
void writeTrainingData() {
|
||||||
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
|
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
|
||||||
PrintStream dinuc_out = null;
|
PrintStream dinuc_out = null;
|
||||||
try {
|
try {
|
||||||
dinuc_out = new PrintStream( dinuc_root+"."+dinucIndex2bases(dinuc_index)+".csv");
|
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts."+dinucIndex2bases(dinuc_index)+".csv");
|
||||||
dinuc_out.println("logitQ,pos,indicator,count");
|
dinuc_out.println("logitQ,pos,indicator,count");
|
||||||
|
|
||||||
for ( RecalData datum: flattenData ) {
|
for ( RecalData datum: flattenData ) {
|
||||||
|
|
@ -225,6 +210,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void qualityDiffVsCycle() {
|
public void qualityDiffVsCycle() {
|
||||||
|
PrintStream ByCycleFile = null;
|
||||||
|
try {
|
||||||
|
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
|
||||||
|
} catch (FileNotFoundException e){
|
||||||
|
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||||
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||||
|
|
@ -252,6 +244,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void qualityDiffVsDinucleotide() {
|
public void qualityDiffVsDinucleotide() {
|
||||||
|
PrintStream ByDinucFile = null;
|
||||||
|
try {
|
||||||
|
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
|
||||||
|
} catch (FileNotFoundException e){
|
||||||
|
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||||
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||||
|
|
@ -280,7 +279,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void qualityEmpiricalObserved() {
|
public void qualityEmpiricalObserved() {
|
||||||
|
PrintStream ByQualFile = null;
|
||||||
|
try {
|
||||||
|
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
|
||||||
|
} catch (FileNotFoundException e){
|
||||||
|
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
||||||
ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
|
ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
|
||||||
ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n");
|
ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n");
|
||||||
|
|
@ -353,12 +358,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
Random random_genrator;
|
Random random_genrator;
|
||||||
// Print out data for regression
|
// Print out data for regression
|
||||||
public CovariateCounterWalker() throws FileNotFoundException {
|
public CovariateCounterWalker() throws FileNotFoundException {
|
||||||
/*if (CREATE_TRAINING_DATA)
|
|
||||||
for ( int i=0; i<NDINUCS; i++) {
|
|
||||||
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
|
||||||
next_dinuc.println("logitQ,pos,indicator");
|
|
||||||
dinuc_outs.add(next_dinuc);
|
|
||||||
} */
|
|
||||||
random_genrator = new Random(123454321); // keep same random seed while debugging
|
random_genrator = new Random(123454321); // keep same random seed while debugging
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -62,7 +62,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
||||||
ArrayList<Pair<Integer, Integer>> mapping = new ArrayList<Pair<Integer, Integer>>();
|
ArrayList<Pair<Integer, Integer>> mapping = new ArrayList<Pair<Integer, Integer>>();
|
||||||
|
|
||||||
String[] elts = headerLine.split("\\s+");
|
String[] elts = headerLine.split("\\s+");
|
||||||
if ( ! elts[0].toLowerCase().equals("dinuc") )
|
if ( ! elts[0].toLowerCase().startsWith("dinuc") ) // checking only start of first field because dinuc will be followed by a version number to be checekde later
|
||||||
Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine);
|
Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine);
|
||||||
|
|
||||||
for ( int k = 1; k < elts.length; k++ ) {
|
for ( int k = 1; k < elts.length; k++ ) {
|
||||||
|
|
|
||||||
|
|
@ -1,19 +1,35 @@
|
||||||
|
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys
|
import subprocess, time, sys
|
||||||
|
|
||||||
dinuc_root = sys.argv[1]
|
dinucs = ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT")
|
||||||
fout = file(dinuc_root+".log_reg_params", "w")
|
|
||||||
|
|
||||||
fout.write("dinuc\t")
|
#covar_work_dir = "/humgen/gsa-scr1/andrewk/recalibration_work"
|
||||||
|
#covar_table_dir = "/humgen/gsa-scr1/andrewk/recalibration_tables"
|
||||||
|
fileroot = sys.argv[1]
|
||||||
|
covar_counts_root = fileroot+".covariate_counts"
|
||||||
|
parameter_root = fileroot+".log_reg"
|
||||||
|
recal_table = fileroot+".recalibration_table"
|
||||||
|
|
||||||
|
# Run all R process to do regression and wait for completion
|
||||||
|
kids = []
|
||||||
|
for dinuc in dinucs:
|
||||||
|
kids.append(subprocess.Popen(["/home/radon01/andrewk/covariates/logistic_regression.R", covar_counts_root, parameter_root, dinuc]))
|
||||||
|
#kids.append(subprocess.Popen(["sleep", "8"]))
|
||||||
|
|
||||||
|
while any(kid.poll() is None for kid in kids):
|
||||||
|
time.sleep(0.25)
|
||||||
|
|
||||||
|
fout = file(recal_table, "w")
|
||||||
|
|
||||||
|
fout.write("dinuc_v.1\t")
|
||||||
for p in range(5):
|
for p in range(5):
|
||||||
for q in range(5):
|
for q in range(5):
|
||||||
fout.write("%d,%d\t" % (p,q))
|
fout.write("%d,%d\t" % (p,q))
|
||||||
fout.write("\n")
|
fout.write("\n")
|
||||||
|
|
||||||
for dinuc in ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT"):
|
for dinuc in dinucs:
|
||||||
dinin = open(dinuc_root+"."+dinuc+".parameters")
|
dinin = open(parameter_root+"."+dinuc+".parameters")
|
||||||
#dinin.readline()
|
#dinin.readline()
|
||||||
params = []
|
params = []
|
||||||
for line in dinin:
|
for line in dinin:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue