From 8125b8b901c933ee809bf8476a2c1b3279791687 Mon Sep 17 00:00:00 2001 From: chartl Date: Tue, 12 Apr 2011 23:00:50 +0000 Subject: [PATCH] Old changes to the exome VQSR search. SGA updated to include new proportion-based insert size test. Major fix for dichotomization test: MathUtils now optionally ignores NaN values for sums, averages, variances. In the future this feature can be pushed back into the AssociationContext object iself (e.g. no data? no entry), but it's kept like this for transparency for now. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5618 348d0f76-0448-11de-a6fe-93d51630548a --- .../association/AssociationContext.java | 6 ++ .../association/AssociationTestRunner.java | 10 ++- .../RegionalAssociationHandler.java | 78 +++++++++++++------ .../RegionalAssociationWalker.java | 11 +-- .../modules/casecontrol/ValueTest.java | 12 +-- .../statistics/Dichotomizer1D.java | 19 ++++- .../broadinstitute/sting/utils/MathUtils.java | 44 +++++++++-- .../oneoffs/chartl/Exome_VQSR_FullSearch.q | 13 +++- .../oneoffs/chartl/ScatterGatherAssociation.q | 3 + 9 files changed, 146 insertions(+), 50 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java index 3c241ca37..b3659f317 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java @@ -64,6 +64,9 @@ public abstract class AssociationContext { window.add(sampleData); } + public void flush() { + window = new ArrayList>(size); + } public boolean isFull() { return window.size() >= size; @@ -78,4 +81,7 @@ public abstract class AssociationContext { public int getWindowSize() { return window.size(); } + + public boolean testOnFlush() { return true; } + public boolean canFlush() { return true; } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java index 4dfa8873e..dbc2ba339 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java @@ -24,9 +24,17 @@ public class AssociationTestRunner { final static int MAX_Q_VALUE = Integer.MAX_VALUE; // todo -- this was written when ACs could implement interfaces, now that they extend, there's no multiple inheritance final static Dichotomizer1D.Transform LOG_TRANSFORM = (new Dichotomizer1D()).new Transform() { + private double EPSILON = 1e-20; @Override public double transform(double d) { - return Math.log(d); + return Math.log(d+EPSILON); + } + }; + + final static Dichotomizer1D.Transform ARCSINE_TRANSFORM = (new Dichotomizer1D()).new Transform() { + @Override + public double transform(double d) { + return Math.asin(1.0-d); } }; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationHandler.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationHandler.java index c8ef6b7fd..9baa30e95 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationHandler.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationHandler.java @@ -15,10 +15,12 @@ public class RegionalAssociationHandler { private MapExtender maps; // todo -- the correct way to do this is via the PluginManager (a la VariantEval) but this is a QND implementation private Set associations; + private boolean bedGraphFormat; - public RegionalAssociationHandler(Set contexts, Set samples) { + public RegionalAssociationHandler(Set contexts, Set samples, boolean bedGraph) { maps = new MapExtender(samples); associations = contexts; + bedGraphFormat = bedGraph; } public void updateExtender(MapHolder mapHolder) { @@ -32,45 +34,75 @@ public class RegionalAssociationHandler { * @reduce: (List,AssociationContext) --> List * @implementation: just append */ - public void runMapReduce() { + public Map runMapReduce() { + Map testResults = new HashMap(); + if ( maps.getPreviousRef() != null && maps.getPreviousRef().getLocus().compareTo(getLocation()) != -1 ) { + testResults.putAll(flush()); + } for ( AssociationContext w : associations ) { if ( w.filter(maps) ) { w.addData(w.mapLocus(maps)); } + + if ( w.isFull() ) { + testResults.put(w,runTest(w)); + w.slide(); + } } + + return testResults; } /** * Switches what formatting to use based on wiggle or standard - * @param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q) + * implicit param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q) * @return - test results in proper format */ - public Map runTests(boolean bedGraphFormat) { + public String runTest(AssociationContext context) { // todo -- maybe the tdf should be the whole window rather than just the most recent loc? - Map testResults = new HashMap(associations.size()); - for ( AssociationContext w : associations ) { - if ( w.isFull() ) { - String outVal; - if ( bedGraphFormat ) { - Pair> statVals = AssociationTestRunner.getTestValues(w); - Pair simpleDichotVals = AssociationTestRunner.getDichotomizedValues(w); - outVal = String.format("%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first,statVals.second.first,statVals.second.second, - simpleDichotVals.first,simpleDichotVals.second); - } else { - outVal = AssociationTestRunner.runTests(w); - Pair simpleDichotVals = AssociationTestRunner.getDichotomizedValues(w); - outVal += String.format("\tD: %.2f\tLogD: %.2f",simpleDichotVals.first,simpleDichotVals.second); - } - testResults.put(w,String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(), - maps.getReferenceContext().getLocus().getStart()-w.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal)); - w.slide(); - } + String outVal; + if ( bedGraphFormat ) { + Pair> statVals = AssociationTestRunner.getTestValues(context); + Pair simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context); + outVal = String.format("%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first,statVals.second.first,statVals.second.second, + simpleDichotVals.first,simpleDichotVals.second); + } else { + outVal = AssociationTestRunner.runTests(context); + Pair simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context); + outVal += String.format("\tD: %.2f\tLogD: %.2f",simpleDichotVals.first,simpleDichotVals.second); } - return testResults; + return String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(), + maps.getReferenceContext().getLocus().getStart()-context.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal); + } public GenomeLoc getLocation() { return maps.getReferenceContext().getLocus(); } + /** + * Flushes context on a jump between intervals. Can not return null. + * @return on-flush tests + */ + public Map flush() { + Map flushTests = new HashMap(); + for ( AssociationContext ac : associations ) { + if ( ac.canFlush() ) { + if ( ac.testOnFlush() ) { + flushTests.put(ac,runTest(ac)); + } + ac.flush(); + } + } + + return flushTests; + } + + public Set getAssociations() { + return associations; + } + + public MapExtender getExtender() { + return maps; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java index 8495b54a2..9595861b0 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java @@ -65,7 +65,7 @@ public class RegionalAssociationWalker extends LocusWalker validAssociations = getAssociations(); - RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples()); + RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples(),bedGraph); return wac; } @@ -76,14 +76,15 @@ public class RegionalAssociationWalker extends LocusWalker testResults; try { - rac.runMapReduce(); + testResults = rac.runMapReduce(); } catch (Exception e) { throw new StingException("Error in map reduce",e); } - Map testsHere = rac.runTests(bedGraph); - if ( testsHere.size() > 0 ) { - for ( Map.Entry result : testsHere.entrySet() ) { + + if ( testResults.size() > 0 ) { + for ( Map.Entry result : testResults.entrySet() ) { out.get(result.getKey().getClass()).printf("%s%n",result.getValue()); } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/casecontrol/ValueTest.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/casecontrol/ValueTest.java index 47690c1f4..5082e6f2f 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/casecontrol/ValueTest.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/casecontrol/ValueTest.java @@ -46,9 +46,9 @@ public abstract class ValueTest extends CaseControl> implemen for ( Map> sampleMap : window ) { for ( Map.Entry> sampleEntry : sampleMap.entrySet() ) { if ( sampleEntry.getKey().getProperty("cohort").equals("case") ) { - caseMeans.add(MathUtils.average(sampleEntry.getValue())); + caseMeans.add(MathUtils.average(sampleEntry.getValue(),true)); } else if ( sampleEntry.getKey().getProperty("cohort").equals("control") ) { - controlMeans.add(MathUtils.average(sampleEntry.getValue())); + controlMeans.add(MathUtils.average(sampleEntry.getValue(),true)); } } } @@ -83,11 +83,11 @@ public abstract class ValueTest extends CaseControl> implemen if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) { return new Pair(Double.NaN,Double.NaN); } - double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE)); - double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase); + double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE),true); + double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase,true); double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size(); - double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL)); - double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl); + double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL),true); + double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl,true); double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size(); double df_num = Math.pow(varCase/nCase + varControl/nControl,2); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/Dichotomizer1D.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/Dichotomizer1D.java index 5f7df6146..82922d316 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/Dichotomizer1D.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/Dichotomizer1D.java @@ -41,11 +41,22 @@ public class Dichotomizer1D { * @return - the so-called "Z"-factor (effect size/spread) */ public static double simpleGaussianDichotomy(Collection setOne, Collection setTwo) { - double meanOne = MathUtils.sum(setOne)/setOne.size(); - double meanTwo = MathUtils.sum(setTwo)/setTwo.size(); - double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne)); - double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo)); + double meanOne = MathUtils.average(setOne,true); + double meanTwo = MathUtils.average(setTwo,true); + double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne,true)); + double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo,true)); + /* + System.out.print("setOne: "); + for ( Number n : setOne ) { + System.out.printf(",%.2f",n.doubleValue()); + } + System.out.print("\tsetTwo: "); + for ( Number n : setTwo ) { + System.out.printf(",%.2f",n.doubleValue()); + } + System.out.printf("\tmn1: %.2f mn2: %.2f var1: %.2f var2: %.2f%n",meanOne,meanTwo,stdOne,stdTwo); + */ return 1.0 - (3.0*(stdOne+stdTwo))/Math.abs(meanOne-meanTwo); } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 245755de1..639335b1e 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -52,28 +52,56 @@ public class MathUtils { /** Private constructor. No instantiating this class! */ private MathUtils() {} - public static double sum( Collection numbers ) { + public static double sum(Collection numbers) { + return sum(numbers,false); + } + + public static double sum( Collection numbers, boolean ignoreNan ) { double sum = 0; for ( Number n : numbers ) { - sum += n.doubleValue(); + if ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) { + sum += n.doubleValue(); + } } return sum; } - public static double average( Collection numbers ) { - return sum(numbers)/numbers.size(); + public static int nonNanSize(Collection numbers) { + int size = 0; + for ( Number n : numbers) { + size += Double.isNaN(n.doubleValue()) ? 0 : 1; + } + + return size; } - public static double variance( Collection numbers, Number mean ) { + public static double average( Collection numbers, boolean ignoreNan) { + if ( ignoreNan ) { + return sum(numbers,true)/nonNanSize(numbers); + } else { + return sum(numbers,false)/nonNanSize(numbers); + } + } + + public static double variance( Collection numbers, Number mean, boolean ignoreNan ) { double mn = mean.doubleValue(); double var = 0; - for ( Number n : numbers ) { var += Math.pow( n.doubleValue() - mn , 2); } - return var; + for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; } + if ( ignoreNan ) { return var/nonNanSize(numbers); } + return var/numbers.size(); + } + + public static double variance(Collection numbers, Number mean) { + return variance(numbers,mean,false); + } + + public static double variance(Collection numbers, boolean ignoreNan) { + return variance(numbers,average(numbers,ignoreNan),ignoreNan); } public static double variance(Collection numbers) { - return variance(numbers,average(numbers)); + return variance(numbers,average(numbers,false),false); } public static double sum(double[] values) { diff --git a/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q b/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q index f7a9482fd..aad6ae70b 100755 --- a/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q +++ b/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q @@ -31,7 +31,7 @@ class Exome_VQSR_FullSearch extends QScript { var VQSR_RODBINDS : HashMap[String,List[RodBind]] = new HashMap[String,List[RodBind]] val VQSR_TAG_FT = "known=false,training=true,truth=%s,prior=%s" - val VQSR_DBSNP_TAG = "known=true,training=false,truth=false" + val VQSR_DBSNP_TAG = "known=true,training=false,truth=false,prior=0.1" for ( tf <- List( (true,false),(false,true),(true,true)) ) { @@ -69,7 +69,7 @@ class Exome_VQSR_FullSearch extends QScript { trait ExpandedIntervals extends CommandLineGATK { if (EXPAND_INTS > 0) { - this.intervals :+= ei.outList + this.intervals :+= ei.outList.getAbsoluteFile } } @@ -177,7 +177,7 @@ class Exome_VQSR_FullSearch extends QScript { exons.tranchesFile = new File(nameFormat.format("exons")+"tranche") exons.recalFile = new File(nameFormat.format("exons")+"recal") var flanks = new ContrastiveRecalibrator with VQSR_Args - flanks.intervals :+= ei.outList + flanks.intervals :+= ei.outList.getAbsoluteFile flanks.jarFile = GATK_JAR flanks.memoryLimit = Some(8) flanks.reference_sequence = REF @@ -211,6 +211,12 @@ class Exome_VQSR_FullSearch extends QScript { trait ApplyArgs extends ApplyRecalibration with ImplicitArgs { this.tranchesFile = recal.tranchesFile this.recalFile = recal.recalFile + for ( r <- recal.rodBind ) { + if ( r.trackName.startsWith("input") ) { + this.rodBind :+= r + } + } + this.memoryLimit = Some(4) } trait EvalArgs extends VariantEval with ImplicitArgs { @@ -218,6 +224,7 @@ class Exome_VQSR_FullSearch extends QScript { this.evalModule = List("TiTvVariantEvaluator","CountVariants","GenotypeConcordance") this.rodBind :+= RodBind("dbsnp","VCF",DBSNP_129) this.rodBind :+= RodBind("compAxiom","VCF",AXIOM_CHIP) + this.memoryLimit = Some(4) } val extender = if ( ext != null ) ".cut%.1f."+ext else ".cut%.1f" diff --git a/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q b/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q index b8b150e54..f4990d258 100755 --- a/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q +++ b/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q @@ -80,6 +80,9 @@ class ScatterGatherAssociation extends QScript { @Output(doc="sd") @Gather(classOf[SimpleTextGatherFunction]) var sd : File = new File(String.format("%s.%s.%s",base,"SampleDepth",ext)) + @Output(doc="rli") + @Gather(classOf[SimpleTextGatherFunction]) + var rli : File = new File(String.format("%s.%s.%s",base,"ReadsLargeInsertSize",ext)) override def commandLine = { var bedStr : String = ""