Basic logistic regression support for calibrating qualities; mostly for Andrew to experiment with
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@529 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
38c2f73457
commit
40a2b3eeb3
|
|
@ -29,8 +29,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
//RecalData[][][] data = new RecalData;
|
||||
ArrayList<RecalData> flattenData = new ArrayList();
|
||||
|
||||
int nuc2num[];
|
||||
char num2nuc[];
|
||||
static int nuc2num[];
|
||||
static char num2nuc[];
|
||||
|
||||
String dinuc_root = "dinuc";
|
||||
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
||||
|
|
@ -125,16 +125,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bases2dinucIndex(char prevBase, char base) {
|
||||
public int bases2dinucIndex(char prevBase, char base) {
|
||||
return nuc2num[prevBase] * 4 + nuc2num[base];
|
||||
}
|
||||
|
||||
String dinucIndex2bases(int index) {
|
||||
public String dinucIndex2bases(int index) {
|
||||
char data[] = {num2nuc[index / 4], num2nuc[index % 4]};
|
||||
return new String( data );
|
||||
}
|
||||
|
||||
public CovariateCounterWalker() throws FileNotFoundException {
|
||||
static {
|
||||
nuc2num = new int[128];
|
||||
nuc2num['A'] = 0;
|
||||
nuc2num['C'] = 1;
|
||||
|
|
@ -150,17 +150,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
num2nuc[1] = 'C';
|
||||
num2nuc[2] = 'G';
|
||||
num2nuc[3] = 'T';
|
||||
}
|
||||
|
||||
// Print out data for regression
|
||||
// Print out data for regression
|
||||
public CovariateCounterWalker() throws FileNotFoundException {
|
||||
for ( int i=0; i<NDINUCS; i++) {
|
||||
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
||||
next_dinuc.println("logitQ,pos,indicator");
|
||||
dinuc_outs.add(next_dinuc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
@WalkerName("LogisticRecalibration")
|
||||
public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
@Argument(shortName="logisticParams", required=true)
|
||||
public String logisticParamsFile;
|
||||
|
||||
@Argument(shortName="outputBAM", required=false, defaultValue="")
|
||||
public String outputFile;
|
||||
|
||||
HashMap<String, LogisticRegressor> regressors = new HashMap<String, LogisticRegressor>();
|
||||
private static Logger logger = Logger.getLogger(LogisticRecalibrationWalker.class);
|
||||
|
||||
public void initialize() {
|
||||
try {
|
||||
List<String> lines = new xReadLines(new File(logisticParamsFile)).readLines();
|
||||
ArrayList<Pair<Integer, Integer>> mapping = parseHeader(lines.get(0));
|
||||
for ( String line : lines.subList(1,lines.size()) ) {
|
||||
// dinuc coeff1 coeff2 ... coeff25
|
||||
String[] vals = line.split("\\s+");
|
||||
String dinuc = vals[0];
|
||||
LogisticRegressor regressor = new LogisticRegressor(2, 4);
|
||||
regressors.put(dinuc, regressor);
|
||||
System.out.printf("Vals = %s%n", Utils.join(",", vals));
|
||||
for ( int i = 1; i < vals.length; i++ ) {
|
||||
Pair<Integer, Integer> ij = mapping.get(i-1);
|
||||
try {
|
||||
double c = Double.parseDouble(vals[i]);
|
||||
regressor.setCoefficient(ij.first, ij.second, c);
|
||||
System.out.printf("Setting coefficient %s => %s = %f%n", dinuc, ij, c);
|
||||
} catch ( NumberFormatException e ) {
|
||||
Utils.scareUser("Badly formed logistic regression header at " + vals[i] + " line: " + line );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, LogisticRegressor> e : regressors.entrySet() ) {
|
||||
String dinuc = e.getKey();
|
||||
LogisticRegressor regressor = e.getValue();
|
||||
logger.debug(String.format("Regressor: %s => %s", dinuc, regressor));
|
||||
}
|
||||
|
||||
//System.exit(1);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
Utils.scareUser("Cannot read/find logistic parameters file " + logisticParamsFile);
|
||||
}
|
||||
}
|
||||
|
||||
// damn I hate parsing lines
|
||||
private ArrayList<Pair<Integer, Integer>> parseHeader(String headerLine) {
|
||||
ArrayList<Pair<Integer, Integer>> mapping = new ArrayList<Pair<Integer, Integer>>();
|
||||
|
||||
String[] elts = headerLine.split("\\s+");
|
||||
if ( ! elts[0].toLowerCase().equals("dinuc") )
|
||||
Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine);
|
||||
|
||||
for ( int k = 1; k < elts.length; k++ ) {
|
||||
String paramStr = elts[k];
|
||||
String[] ij = paramStr.split(",");
|
||||
if ( ij.length != 2 ) {
|
||||
Utils.scareUser("Badly formed logistic regression header at " + paramStr + " line: " + headerLine);
|
||||
} else {
|
||||
try {
|
||||
int i = Integer.parseInt(ij[0]);
|
||||
int j = Integer.parseInt(ij[1]);
|
||||
mapping.add(new Pair<Integer, Integer>(i,j));
|
||||
logger.info(String.format("%d => %d/%d", k, i, j));
|
||||
} catch ( NumberFormatException e ) {
|
||||
Utils.scareUser("Badly formed logistic regression header at " + paramStr + " line: " + headerLine );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mapping;
|
||||
}
|
||||
|
||||
|
||||
public SAMRecord map(LocusContext context, SAMRecord read) {
|
||||
SAMRecord recalRead = read;
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] quals = read.getBaseQualities();
|
||||
byte[] recalQuals = new byte[quals.length];
|
||||
|
||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||
for ( int i = 1; i < bases.length; i++ ) {
|
||||
int cycle = i;
|
||||
byte qual = quals[i];
|
||||
String dinuc = String.format("%c%c", bases[i-1], bases[i]);
|
||||
//System.out.printf("dinuc %c %c%n", bases[i-1], bases[i]);
|
||||
LogisticRegressor regressor = regressors.get(dinuc);
|
||||
double P = -1;
|
||||
byte newQual = qual;
|
||||
|
||||
if ( regressor != null ) { // no N or some other unexpected bp in the stream
|
||||
double logPOver1minusP = regressor.regress((double)cycle+1, (double)qual);
|
||||
double POver1minusP = Math.pow(10, logPOver1minusP);
|
||||
P = POver1minusP / (1 + POver1minusP);
|
||||
newQual = QualityUtils.probToQual(P);
|
||||
//newQual = (byte)Math.min(Math.round(newQualDouble),63);
|
||||
System.out.printf("Recal %s %d %d => %f => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, POver1minusP, P, newQual);
|
||||
}
|
||||
|
||||
recalQuals[i] = newQual;
|
||||
}
|
||||
|
||||
System.out.printf("OLD: %s%n", read.format());
|
||||
read.setBaseQualities(recalQuals);
|
||||
System.out.printf("NEW: %s%n", read.format());
|
||||
return recalRead;
|
||||
}
|
||||
|
||||
public void onTraversalDone(SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
output.close();
|
||||
}
|
||||
}
|
||||
|
||||
public SAMFileWriter reduceInit() {
|
||||
if ( outputFile != null ) { // ! outputFile.equals("") ) {
|
||||
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
||||
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
|
||||
return fact.makeBAMWriter(header, true, new File(outputFile));
|
||||
}
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Summarize the error rate data.
|
||||
*
|
||||
*/
|
||||
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
output.addAlignment(read);
|
||||
} else {
|
||||
out.println(read.format());
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Apr 23, 2009
|
||||
* Time: 3:46:30 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class LogisticRegressor {
|
||||
double[][] coefficients;
|
||||
int nFeatures;
|
||||
int order;
|
||||
|
||||
public LogisticRegressor(int nFeatures, int order) {
|
||||
this.nFeatures = nFeatures;
|
||||
this.order = order;
|
||||
|
||||
if ( nFeatures != 2 )
|
||||
throw new IllegalArgumentException("LogisticRegressor currently only supports 2 features :-(");
|
||||
|
||||
// setup coefficient matrix
|
||||
coefficients = new double[order+1][order+1];
|
||||
for ( int i = 0; i < order; i++ ) {
|
||||
for ( int j = 0; j < order; j++ ) {
|
||||
coefficients[i][j] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public double[][] getCoefficients() {
|
||||
return coefficients;
|
||||
}
|
||||
|
||||
public void setCoefficient(int i, int j, double c) {
|
||||
coefficients[i][j] = c;
|
||||
}
|
||||
|
||||
public double regress(double f1, double f2) {
|
||||
double v = 0.0;
|
||||
for ( int i = 0; i < order; i++ ) {
|
||||
for ( int j = 0; j < order; j++ ) {
|
||||
double c = coefficients[i][j];
|
||||
v += c * Math.pow(f1,i) * Math.pow(f2, j);
|
||||
}
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder s = new StringBuilder();
|
||||
s.append(String.format("nFeatures=%d, order=%d: ", nFeatures, order));
|
||||
for ( int i = 0; i < order; i++ ) {
|
||||
for ( int j = 0; j < order; j++ ) {
|
||||
s.append(" " + coefficients[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -27,7 +27,7 @@ def main():
|
|||
type="string", default=None,
|
||||
help="Farm queue to submit jobs to. Leave blank for local processing")
|
||||
parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles",
|
||||
action='store_true', default=True,
|
||||
action='store_true', default=False,
|
||||
help="Ignores the existing files")
|
||||
parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
|
||||
action='store_true', default=False,
|
||||
|
|
@ -100,6 +100,7 @@ def main():
|
|||
farm_commands.cmd(cmd, None, None)
|
||||
|
||||
if not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles:
|
||||
print validationOutput, 'does not exist'
|
||||
analysis = "ValidatingPileup"
|
||||
cmd = "java -ea -Xmx1024m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U -B pileup SAMPileup " + pileup
|
||||
print cmd
|
||||
|
|
|
|||
|
|
@ -14,12 +14,12 @@
|
|||
/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
|
||||
|
||||
# anthony PCR lane
|
||||
# samtools import /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/pcr/samfiles/10035.5.clean.sam 10035.5.clean.bam
|
||||
/humgen/gsa-scr1/GATK_Data/Validation_Data/10035.5.clean.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
|
||||
# samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/pcr/samfiles/10035.5.clean.sam 10035.5.clean.bam
|
||||
/humgen/gsa-scr1/GATK_Data/Validation_Data/10035.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta *
|
||||
|
||||
# anthony HS lane
|
||||
# samtools import /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam
|
||||
/humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
|
||||
# samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam
|
||||
/humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta *
|
||||
|
||||
# WGS tumor -- figure it out
|
||||
#/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
|
||||
|
|
|
|||
|
|
@ -0,0 +1,17 @@
|
|||
dinuc 0,0 1,0 2,0 3,0 4,0 1,1 2,1 3,1 4,1
|
||||
AA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
AT 0.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
AC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
AG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
TA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
TT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
TC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
TG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
CA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
CT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
CC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
CG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
GA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
GT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
GC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
GG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
|
||||
Loading…
Reference in New Issue