From 40a2b3eeb3c3d102d1a068ed13d288c8c8c9761b Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 24 Apr 2009 19:09:50 +0000 Subject: [PATCH] Basic logistic regression support for calibrating qualities; mostly for Andrew to experiment with git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@529 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/CovariateCounterWalker.java | 21 +-- .../walkers/LogisticRecalibrationWalker.java | 157 ++++++++++++++++++ .../sting/utils/LogisticRegressor.java | 67 ++++++++ python/ValidateGATK.py | 3 +- testdata/ValidatingPileupTargets.list | 8 +- testdata/logisticParamsTest.list | 17 ++ 6 files changed, 256 insertions(+), 17 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java create mode 100755 java/src/org/broadinstitute/sting/utils/LogisticRegressor.java create mode 100755 testdata/logisticParamsTest.list diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java index 0db597a9e..695eecbe7 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java @@ -29,8 +29,8 @@ public class CovariateCounterWalker extends LocusWalker { //RecalData[][][] data = new RecalData; ArrayList flattenData = new ArrayList(); - int nuc2num[]; - char num2nuc[]; + static int nuc2num[]; + static char num2nuc[]; String dinuc_root = "dinuc"; ArrayList dinuc_outs = new ArrayList(); @@ -125,16 +125,16 @@ public class CovariateCounterWalker extends LocusWalker { return 0; } - int bases2dinucIndex(char prevBase, char base) { + public int bases2dinucIndex(char prevBase, char base) { return nuc2num[prevBase] * 4 + nuc2num[base]; } - String dinucIndex2bases(int index) { + public String dinucIndex2bases(int index) { char data[] = {num2nuc[index / 4], num2nuc[index % 4]}; return new String( data ); } - public CovariateCounterWalker() throws FileNotFoundException { + static { nuc2num = new int[128]; nuc2num['A'] = 0; nuc2num['C'] = 1; @@ -150,17 +150,14 @@ public class CovariateCounterWalker extends LocusWalker { num2nuc[1] = 'C'; num2nuc[2] = 'G'; num2nuc[3] = 'T'; + } - // Print out data for regression + // Print out data for regression + public CovariateCounterWalker() throws FileNotFoundException { for ( int i=0; i { + @Argument(shortName="logisticParams", required=true) + public String logisticParamsFile; + + @Argument(shortName="outputBAM", required=false, defaultValue="") + public String outputFile; + + HashMap regressors = new HashMap(); + private static Logger logger = Logger.getLogger(LogisticRecalibrationWalker.class); + + public void initialize() { + try { + List lines = new xReadLines(new File(logisticParamsFile)).readLines(); + ArrayList> mapping = parseHeader(lines.get(0)); + for ( String line : lines.subList(1,lines.size()) ) { + // dinuc coeff1 coeff2 ... coeff25 + String[] vals = line.split("\\s+"); + String dinuc = vals[0]; + LogisticRegressor regressor = new LogisticRegressor(2, 4); + regressors.put(dinuc, regressor); + System.out.printf("Vals = %s%n", Utils.join(",", vals)); + for ( int i = 1; i < vals.length; i++ ) { + Pair ij = mapping.get(i-1); + try { + double c = Double.parseDouble(vals[i]); + regressor.setCoefficient(ij.first, ij.second, c); + System.out.printf("Setting coefficient %s => %s = %f%n", dinuc, ij, c); + } catch ( NumberFormatException e ) { + Utils.scareUser("Badly formed logistic regression header at " + vals[i] + " line: " + line ); + } + } + } + + for ( Map.Entry e : regressors.entrySet() ) { + String dinuc = e.getKey(); + LogisticRegressor regressor = e.getValue(); + logger.debug(String.format("Regressor: %s => %s", dinuc, regressor)); + } + + //System.exit(1); + } catch ( FileNotFoundException e ) { + Utils.scareUser("Cannot read/find logistic parameters file " + logisticParamsFile); + } + } + + // damn I hate parsing lines + private ArrayList> parseHeader(String headerLine) { + ArrayList> mapping = new ArrayList>(); + + String[] elts = headerLine.split("\\s+"); + if ( ! elts[0].toLowerCase().equals("dinuc") ) + Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine); + + for ( int k = 1; k < elts.length; k++ ) { + String paramStr = elts[k]; + String[] ij = paramStr.split(","); + if ( ij.length != 2 ) { + Utils.scareUser("Badly formed logistic regression header at " + paramStr + " line: " + headerLine); + } else { + try { + int i = Integer.parseInt(ij[0]); + int j = Integer.parseInt(ij[1]); + mapping.add(new Pair(i,j)); + logger.info(String.format("%d => %d/%d", k, i, j)); + } catch ( NumberFormatException e ) { + Utils.scareUser("Badly formed logistic regression header at " + paramStr + " line: " + headerLine ); + } + } + } + + return mapping; + } + + + public SAMRecord map(LocusContext context, SAMRecord read) { + SAMRecord recalRead = read; + byte[] bases = read.getReadBases(); + byte[] quals = read.getBaseQualities(); + byte[] recalQuals = new byte[quals.length]; + + recalQuals[0] = quals[0]; // can't change the first -- no dinuc + for ( int i = 1; i < bases.length; i++ ) { + int cycle = i; + byte qual = quals[i]; + String dinuc = String.format("%c%c", bases[i-1], bases[i]); + //System.out.printf("dinuc %c %c%n", bases[i-1], bases[i]); + LogisticRegressor regressor = regressors.get(dinuc); + double P = -1; + byte newQual = qual; + + if ( regressor != null ) { // no N or some other unexpected bp in the stream + double logPOver1minusP = regressor.regress((double)cycle+1, (double)qual); + double POver1minusP = Math.pow(10, logPOver1minusP); + P = POver1minusP / (1 + POver1minusP); + newQual = QualityUtils.probToQual(P); + //newQual = (byte)Math.min(Math.round(newQualDouble),63); + System.out.printf("Recal %s %d %d => %f => %f => %f leads to %d%n", dinuc, cycle, qual, logPOver1minusP, POver1minusP, P, newQual); + } + + recalQuals[i] = newQual; + } + + System.out.printf("OLD: %s%n", read.format()); + read.setBaseQualities(recalQuals); + System.out.printf("NEW: %s%n", read.format()); + return recalRead; + } + + public void onTraversalDone(SAMFileWriter output) { + if ( output != null ) { + output.close(); + } + } + + public SAMFileWriter reduceInit() { + if ( outputFile != null ) { // ! outputFile.equals("") ) { + SAMFileWriterFactory fact = new SAMFileWriterFactory(); + SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader(); + return fact.makeBAMWriter(header, true, new File(outputFile)); + } + else { + return null; + } + } + + /** + * Summarize the error rate data. + * + */ + public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) { + if ( output != null ) { + output.addAlignment(read); + } else { + out.println(read.format()); + } + + return output; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/LogisticRegressor.java b/java/src/org/broadinstitute/sting/utils/LogisticRegressor.java new file mode 100755 index 000000000..cdf6ab013 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/LogisticRegressor.java @@ -0,0 +1,67 @@ +package org.broadinstitute.sting.utils; + +import net.sf.samtools.SAMFileReader; + +import java.util.HashMap; +import java.util.ArrayList; +import java.io.File; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Apr 23, 2009 + * Time: 3:46:30 PM + * To change this template use File | Settings | File Templates. + */ +public class LogisticRegressor { + double[][] coefficients; + int nFeatures; + int order; + + public LogisticRegressor(int nFeatures, int order) { + this.nFeatures = nFeatures; + this.order = order; + + if ( nFeatures != 2 ) + throw new IllegalArgumentException("LogisticRegressor currently only supports 2 features :-("); + + // setup coefficient matrix + coefficients = new double[order+1][order+1]; + for ( int i = 0; i < order; i++ ) { + for ( int j = 0; j < order; j++ ) { + coefficients[i][j] = 0.0; + } + } + } + + public double[][] getCoefficients() { + return coefficients; + } + + public void setCoefficient(int i, int j, double c) { + coefficients[i][j] = c; + } + + public double regress(double f1, double f2) { + double v = 0.0; + for ( int i = 0; i < order; i++ ) { + for ( int j = 0; j < order; j++ ) { + double c = coefficients[i][j]; + v += c * Math.pow(f1,i) * Math.pow(f2, j); + } + } + return v; + } + + public String toString() { + StringBuilder s = new StringBuilder(); + s.append(String.format("nFeatures=%d, order=%d: ", nFeatures, order)); + for ( int i = 0; i < order; i++ ) { + for ( int j = 0; j < order; j++ ) { + s.append(" " + coefficients[i][j]); + } + } + + return s.toString(); + } +} diff --git a/python/ValidateGATK.py b/python/ValidateGATK.py index ab1512295..984adef44 100755 --- a/python/ValidateGATK.py +++ b/python/ValidateGATK.py @@ -27,7 +27,7 @@ def main(): type="string", default=None, help="Farm queue to submit jobs to. Leave blank for local processing") parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles", - action='store_true', default=True, + action='store_true', default=False, help="Ignores the existing files") parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles", action='store_true', default=False, @@ -100,6 +100,7 @@ def main(): farm_commands.cmd(cmd, None, None) if not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles: + print validationOutput, 'does not exist' analysis = "ValidatingPileup" cmd = "java -ea -Xmx1024m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U -B pileup SAMPileup " + pileup print cmd diff --git a/testdata/ValidatingPileupTargets.list b/testdata/ValidatingPileupTargets.list index 6ea3879d0..499bd4e6e 100755 --- a/testdata/ValidatingPileupTargets.list +++ b/testdata/ValidatingPileupTargets.list @@ -14,12 +14,12 @@ /broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta * # anthony PCR lane -# samtools import /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/pcr/samfiles/10035.5.clean.sam 10035.5.clean.bam -/humgen/gsa-scr1/GATK_Data/Validation_Data/10035.5.clean.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta * +# samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/pcr/samfiles/10035.5.clean.sam 10035.5.clean.bam +/humgen/gsa-scr1/GATK_Data/Validation_Data/10035.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta * # anthony HS lane -# samtools import /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam -/humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta * +# samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam +/humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta * # WGS tumor -- figure it out #/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta * diff --git a/testdata/logisticParamsTest.list b/testdata/logisticParamsTest.list new file mode 100755 index 000000000..c56f53dae --- /dev/null +++ b/testdata/logisticParamsTest.list @@ -0,0 +1,17 @@ +dinuc 0,0 1,0 2,0 3,0 4,0 1,1 2,1 3,1 4,1 +AA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +AT 0.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +AC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +AG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +TA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +TT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +TC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +TG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +CA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +CT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +CC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +CG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +GA 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +GT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +GC 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 +GG 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0