Basic logistic regression support for calibrating qualities; mostly for Andrew to experiment with

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@529 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-24 19:09:50 +00:00
parent 38c2f73457
commit 40a2b3eeb3
6 changed files with 256 additions and 17 deletions

View File

@ -29,8 +29,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
//RecalData[][][] data = new RecalData;
ArrayList<RecalData> flattenData = new ArrayList();
int nuc2num[];
char num2nuc[];
static int nuc2num[];
static char num2nuc[];
String dinuc_root = "dinuc";
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
@ -125,16 +125,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
return 0;
}
int bases2dinucIndex(char prevBase, char base) {
public int bases2dinucIndex(char prevBase, char base) {
return nuc2num[prevBase] * 4 + nuc2num[base];
}
String dinucIndex2bases(int index) {
public String dinucIndex2bases(int index) {
char data[] = {num2nuc[index / 4], num2nuc[index % 4]};
return new String( data );
}
public CovariateCounterWalker() throws FileNotFoundException {
static {
nuc2num = new int[128];
nuc2num['A'] = 0;
nuc2num['C'] = 1;
@ -150,17 +150,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
num2nuc[1] = 'C';
num2nuc[2] = 'G';
num2nuc[3] = 'T';
}
// Print out data for regression
// Print out data for regression
public CovariateCounterWalker() throws FileNotFoundException {
for ( int i=0; i<NDINUCS; i++) {
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
next_dinuc.println("logitQ,pos,indicator");
dinuc_outs.add(next_dinuc);
}
}
}
}

View File

@ -0,0 +1,157 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.apache.log4j.Logger;
import java.util.*;
import java.io.File;
import java.io.FileNotFoundException;
@WalkerName("LogisticRecalibration")
public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(shortName="logisticParams", required=true)
public String logisticParamsFile;
@Argument(shortName="outputBAM", required=false, defaultValue="")
public String outputFile;
HashMap<String, LogisticRegressor> regressors = new HashMap<String, LogisticRegressor>();
private static Logger logger = Logger.getLogger(LogisticRecalibrationWalker.class);
public void initialize() {
try {
List<String> lines = new xReadLines(new File(logisticParamsFile)).readLines();
ArrayList<Pair<Integer, Integer>> mapping = parseHeader(lines.get(0));
for ( String line : lines.subList(1,lines.size()) ) {
// dinuc coeff1 coeff2 ... coeff25
String[] vals = line.split("\\s+");
String dinuc = vals[0];
LogisticRegressor regressor = new LogisticRegressor(2, 4);
regressors.put(dinuc, regressor);
System.out.printf("Vals = %s%n", Utils.join(",", vals));
for ( int i = 1; i < vals.length; i++ ) {
Pair<Integer, Integer> ij = mapping.get(i-1);
try {
double c = Double.parseDouble(vals[i]);
regressor.setCoefficient(ij.first, ij.second, c);
System.out.printf("Setting coefficient %s => %s = %f%n", dinuc, ij, c);
} catch ( NumberFormatException e ) {
Utils.scareUser("Badly formed logistic regression header at " + vals[i] + " line: " + line );
}
}
}
for ( Map.Entry<String, LogisticRegressor> e : regressors.entrySet() ) {
String dinuc = e.getKey();
LogisticRegressor regressor = e.getValue();
logger.debug(String.format("Regressor: %s => %s", dinuc, regressor));
}
//System.exit(1);
} catch ( FileNotFoundException e ) {
Utils.scareUser("Cannot read/find logistic parameters file " + logisticParamsFile);
}
}
// damn I hate parsing lines
private ArrayList<Pair<Integer, Integer>> parseHeader(String headerLine) {
ArrayList<Pair<Integer, Integer>> mapping = new ArrayList<Pair<Integer, Integer>>();
String[] elts = headerLine.split("\\s+");
if ( ! elts[0].toLowerCase().equals("dinuc") )
Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine);
for ( int k = 1; k < elts.length; k++ ) {
String paramStr = elts[k];
String[] ij = paramStr.split(",");
if ( ij.length != 2 ) {
Utils.scareUser("Badly formed logistic regression header at " + paramStr + " line: " + headerLine);
} else {
try {
int i = Integer.parseInt(ij[0]);
int j = Integer.parseInt(ij[1]);
mapping.add(new Pair<Integer, Integer>(i,j));
logger.info(String.format("%d => %d/%d", k, i, j));
} catch ( NumberFormatException e ) {
Utils.scareUser("Badly formed logistic regression header at " + paramStr + " line: " + headerLine );
}
}
}
return mapping;
}
public SAMRecord map(LocusContext context, SAMRecord read) {
SAMRecord recalRead = read;
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
byte[] recalQuals = new byte[quals.length];
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
for ( int i = 1; i < bases.length; i++ ) {
int cycle = i;
byte qual = quals[i];
String dinuc = String.format("%c%c", bases[i-1], bases[i]);
//System.out.printf("dinuc %c %c%n", bases[i-1], bases[i]);
LogisticRegressor regressor = regressors.get(dinuc);
double P = -1;
byte newQual = qual;
if ( regressor != null ) { // no N or some other unexpected bp in the stream
double logPOver1minusP = regressor.regress((double)cycle+1, (double)qual);
double POver1minusP = Math.pow(10, logPOver1minusP);
P = POver1minusP / (1 + POver1minusP);
newQual = QualityUtils.probToQual(P);
//newQual = (byte)Math.min(Math.round(newQualDouble),63);
System.out.printf("Recal %s %d %d => %f => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, POver1minusP, P, newQual);
}
recalQuals[i] = newQual;
}
System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals);
System.out.printf("NEW: %s%n", read.format());
return recalRead;
}
public void onTraversalDone(SAMFileWriter output) {
if ( output != null ) {
output.close();
}
}
public SAMFileWriter reduceInit() {
if ( outputFile != null ) { // ! outputFile.equals("") ) {
SAMFileWriterFactory fact = new SAMFileWriterFactory();
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
return fact.makeBAMWriter(header, true, new File(outputFile));
}
else {
return null;
}
}
/**
* Summarize the error rate data.
*
*/
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
if ( output != null ) {
output.addAlignment(read);
} else {
out.println(read.format());
}
return output;
}
}

View File

@ -0,0 +1,67 @@
package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileReader;
import java.util.HashMap;
import java.util.ArrayList;
import java.io.File;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Apr 23, 2009
* Time: 3:46:30 PM
* To change this template use File | Settings | File Templates.
*/
public class LogisticRegressor {
double[][] coefficients;
int nFeatures;
int order;
public LogisticRegressor(int nFeatures, int order) {
this.nFeatures = nFeatures;
this.order = order;
if ( nFeatures != 2 )
throw new IllegalArgumentException("LogisticRegressor currently only supports 2 features :-(");
// setup coefficient matrix
coefficients = new double[order+1][order+1];
for ( int i = 0; i < order; i++ ) {
for ( int j = 0; j < order; j++ ) {
coefficients[i][j] = 0.0;
}
}
}
public double[][] getCoefficients() {
return coefficients;
}
public void setCoefficient(int i, int j, double c) {
coefficients[i][j] = c;
}
public double regress(double f1, double f2) {
double v = 0.0;
for ( int i = 0; i < order; i++ ) {
for ( int j = 0; j < order; j++ ) {
double c = coefficients[i][j];
v += c * Math.pow(f1,i) * Math.pow(f2, j);
}
}
return v;
}
public String toString() {
StringBuilder s = new StringBuilder();
s.append(String.format("nFeatures=%d, order=%d: ", nFeatures, order));
for ( int i = 0; i < order; i++ ) {
for ( int j = 0; j < order; j++ ) {
s.append(" " + coefficients[i][j]);
}
}
return s.toString();
}
}

View File

@ -27,7 +27,7 @@ def main():
type="string", default=None,
help="Farm queue to submit jobs to. Leave blank for local processing")
parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles",
action='store_true', default=True,
action='store_true', default=False,
help="Ignores the existing files")
parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
action='store_true', default=False,
@ -100,6 +100,7 @@ def main():
farm_commands.cmd(cmd, None, None)
if not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles:
print validationOutput, 'does not exist'
analysis = "ValidatingPileup"
cmd = "java -ea -Xmx1024m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U -B pileup SAMPileup " + pileup
print cmd

View File

@ -14,12 +14,12 @@
/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
# anthony PCR lane
# samtools import /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/pcr/samfiles/10035.5.clean.sam 10035.5.clean.bam
/humgen/gsa-scr1/GATK_Data/Validation_Data/10035.5.clean.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
# samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/pcr/samfiles/10035.5.clean.sam 10035.5.clean.bam
/humgen/gsa-scr1/GATK_Data/Validation_Data/10035.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta *
# anthony HS lane
# samtools import /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam
/humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
# samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam
/humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta *
# WGS tumor -- figure it out
#/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *

View File

@ -0,0 +1,17 @@
dinuc 0,0 1,0 2,0 3,0 4,0 1,1 2,1 3,1 4,1
AA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AT 0.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
TA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
TT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
TC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
TG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
CA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
CT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
CC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
CG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
GA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
GT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
GC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
GG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0