diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java index 3c241ca37..b3659f317 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationContext.java @@ -64,6 +64,9 @@ public abstract class AssociationContext { window.add(sampleData); } + public void flush() { + window = new ArrayList>(size); + } public boolean isFull() { return window.size() >= size; @@ -78,4 +81,7 @@ public abstract class AssociationContext { public int getWindowSize() { return window.size(); } + + public boolean testOnFlush() { return true; } + public boolean canFlush() { return true; } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java index 4dfa8873e..dbc2ba339 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/AssociationTestRunner.java @@ -24,9 +24,17 @@ public class AssociationTestRunner { final static int MAX_Q_VALUE = Integer.MAX_VALUE; // todo -- this was written when ACs could implement interfaces, now that they extend, there's no multiple inheritance final static Dichotomizer1D.Transform LOG_TRANSFORM = (new Dichotomizer1D()).new Transform() { + private double EPSILON = 1e-20; @Override public double transform(double d) { - return Math.log(d); + return Math.log(d+EPSILON); + } + }; + + final static Dichotomizer1D.Transform ARCSINE_TRANSFORM = (new Dichotomizer1D()).new Transform() { + @Override + public double transform(double d) { + return Math.asin(1.0-d); } }; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationHandler.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationHandler.java index c8ef6b7fd..9baa30e95 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationHandler.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationHandler.java @@ -15,10 +15,12 @@ public class RegionalAssociationHandler { private MapExtender maps; // todo -- the correct way to do this is via the PluginManager (a la VariantEval) but this is a QND implementation private Set associations; + private boolean bedGraphFormat; - public RegionalAssociationHandler(Set contexts, Set samples) { + public RegionalAssociationHandler(Set contexts, Set samples, boolean bedGraph) { maps = new MapExtender(samples); associations = contexts; + bedGraphFormat = bedGraph; } public void updateExtender(MapHolder mapHolder) { @@ -32,45 +34,75 @@ public class RegionalAssociationHandler { * @reduce: (List,AssociationContext) --> List * @implementation: just append */ - public void runMapReduce() { + public Map runMapReduce() { + Map testResults = new HashMap(); + if ( maps.getPreviousRef() != null && maps.getPreviousRef().getLocus().compareTo(getLocation()) != -1 ) { + testResults.putAll(flush()); + } for ( AssociationContext w : associations ) { if ( w.filter(maps) ) { w.addData(w.mapLocus(maps)); } + + if ( w.isFull() ) { + testResults.put(w,runTest(w)); + w.slide(); + } } + + return testResults; } /** * Switches what formatting to use based on wiggle or standard - * @param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q) + * implicit param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q) * @return - test results in proper format */ - public Map runTests(boolean bedGraphFormat) { + public String runTest(AssociationContext context) { // todo -- maybe the tdf should be the whole window rather than just the most recent loc? - Map testResults = new HashMap(associations.size()); - for ( AssociationContext w : associations ) { - if ( w.isFull() ) { - String outVal; - if ( bedGraphFormat ) { - Pair> statVals = AssociationTestRunner.getTestValues(w); - Pair simpleDichotVals = AssociationTestRunner.getDichotomizedValues(w); - outVal = String.format("%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first,statVals.second.first,statVals.second.second, - simpleDichotVals.first,simpleDichotVals.second); - } else { - outVal = AssociationTestRunner.runTests(w); - Pair simpleDichotVals = AssociationTestRunner.getDichotomizedValues(w); - outVal += String.format("\tD: %.2f\tLogD: %.2f",simpleDichotVals.first,simpleDichotVals.second); - } - testResults.put(w,String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(), - maps.getReferenceContext().getLocus().getStart()-w.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal)); - w.slide(); - } + String outVal; + if ( bedGraphFormat ) { + Pair> statVals = AssociationTestRunner.getTestValues(context); + Pair simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context); + outVal = String.format("%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first,statVals.second.first,statVals.second.second, + simpleDichotVals.first,simpleDichotVals.second); + } else { + outVal = AssociationTestRunner.runTests(context); + Pair simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context); + outVal += String.format("\tD: %.2f\tLogD: %.2f",simpleDichotVals.first,simpleDichotVals.second); } - return testResults; + return String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(), + maps.getReferenceContext().getLocus().getStart()-context.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal); + } public GenomeLoc getLocation() { return maps.getReferenceContext().getLocus(); } + /** + * Flushes context on a jump between intervals. Can not return null. + * @return on-flush tests + */ + public Map flush() { + Map flushTests = new HashMap(); + for ( AssociationContext ac : associations ) { + if ( ac.canFlush() ) { + if ( ac.testOnFlush() ) { + flushTests.put(ac,runTest(ac)); + } + ac.flush(); + } + } + + return flushTests; + } + + public Set getAssociations() { + return associations; + } + + public MapExtender getExtender() { + return maps; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java index 8495b54a2..9595861b0 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java @@ -65,7 +65,7 @@ public class RegionalAssociationWalker extends LocusWalker validAssociations = getAssociations(); - RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples()); + RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples(),bedGraph); return wac; } @@ -76,14 +76,15 @@ public class RegionalAssociationWalker extends LocusWalker testResults; try { - rac.runMapReduce(); + testResults = rac.runMapReduce(); } catch (Exception e) { throw new StingException("Error in map reduce",e); } - Map testsHere = rac.runTests(bedGraph); - if ( testsHere.size() > 0 ) { - for ( Map.Entry result : testsHere.entrySet() ) { + + if ( testResults.size() > 0 ) { + for ( Map.Entry result : testResults.entrySet() ) { out.get(result.getKey().getClass()).printf("%s%n",result.getValue()); } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/casecontrol/ValueTest.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/casecontrol/ValueTest.java index 47690c1f4..5082e6f2f 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/casecontrol/ValueTest.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/casecontrol/ValueTest.java @@ -46,9 +46,9 @@ public abstract class ValueTest extends CaseControl> implemen for ( Map> sampleMap : window ) { for ( Map.Entry> sampleEntry : sampleMap.entrySet() ) { if ( sampleEntry.getKey().getProperty("cohort").equals("case") ) { - caseMeans.add(MathUtils.average(sampleEntry.getValue())); + caseMeans.add(MathUtils.average(sampleEntry.getValue(),true)); } else if ( sampleEntry.getKey().getProperty("cohort").equals("control") ) { - controlMeans.add(MathUtils.average(sampleEntry.getValue())); + controlMeans.add(MathUtils.average(sampleEntry.getValue(),true)); } } } @@ -83,11 +83,11 @@ public abstract class ValueTest extends CaseControl> implemen if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) { return new Pair(Double.NaN,Double.NaN); } - double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE)); - double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase); + double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE),true); + double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase,true); double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size(); - double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL)); - double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl); + double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL),true); + double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl,true); double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size(); double df_num = Math.pow(varCase/nCase + varControl/nControl,2); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/Dichotomizer1D.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/Dichotomizer1D.java index 5f7df6146..82922d316 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/Dichotomizer1D.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/statistics/Dichotomizer1D.java @@ -41,11 +41,22 @@ public class Dichotomizer1D { * @return - the so-called "Z"-factor (effect size/spread) */ public static double simpleGaussianDichotomy(Collection setOne, Collection setTwo) { - double meanOne = MathUtils.sum(setOne)/setOne.size(); - double meanTwo = MathUtils.sum(setTwo)/setTwo.size(); - double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne)); - double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo)); + double meanOne = MathUtils.average(setOne,true); + double meanTwo = MathUtils.average(setTwo,true); + double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne,true)); + double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo,true)); + /* + System.out.print("setOne: "); + for ( Number n : setOne ) { + System.out.printf(",%.2f",n.doubleValue()); + } + System.out.print("\tsetTwo: "); + for ( Number n : setTwo ) { + System.out.printf(",%.2f",n.doubleValue()); + } + System.out.printf("\tmn1: %.2f mn2: %.2f var1: %.2f var2: %.2f%n",meanOne,meanTwo,stdOne,stdTwo); + */ return 1.0 - (3.0*(stdOne+stdTwo))/Math.abs(meanOne-meanTwo); } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 245755de1..639335b1e 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -52,28 +52,56 @@ public class MathUtils { /** Private constructor. No instantiating this class! */ private MathUtils() {} - public static double sum( Collection numbers ) { + public static double sum(Collection numbers) { + return sum(numbers,false); + } + + public static double sum( Collection numbers, boolean ignoreNan ) { double sum = 0; for ( Number n : numbers ) { - sum += n.doubleValue(); + if ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) { + sum += n.doubleValue(); + } } return sum; } - public static double average( Collection numbers ) { - return sum(numbers)/numbers.size(); + public static int nonNanSize(Collection numbers) { + int size = 0; + for ( Number n : numbers) { + size += Double.isNaN(n.doubleValue()) ? 0 : 1; + } + + return size; } - public static double variance( Collection numbers, Number mean ) { + public static double average( Collection numbers, boolean ignoreNan) { + if ( ignoreNan ) { + return sum(numbers,true)/nonNanSize(numbers); + } else { + return sum(numbers,false)/nonNanSize(numbers); + } + } + + public static double variance( Collection numbers, Number mean, boolean ignoreNan ) { double mn = mean.doubleValue(); double var = 0; - for ( Number n : numbers ) { var += Math.pow( n.doubleValue() - mn , 2); } - return var; + for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; } + if ( ignoreNan ) { return var/nonNanSize(numbers); } + return var/numbers.size(); + } + + public static double variance(Collection numbers, Number mean) { + return variance(numbers,mean,false); + } + + public static double variance(Collection numbers, boolean ignoreNan) { + return variance(numbers,average(numbers,ignoreNan),ignoreNan); } public static double variance(Collection numbers) { - return variance(numbers,average(numbers)); + return variance(numbers,average(numbers,false),false); } public static double sum(double[] values) { diff --git a/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q b/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q index f7a9482fd..aad6ae70b 100755 --- a/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q +++ b/scala/qscript/oneoffs/chartl/Exome_VQSR_FullSearch.q @@ -31,7 +31,7 @@ class Exome_VQSR_FullSearch extends QScript { var VQSR_RODBINDS : HashMap[String,List[RodBind]] = new HashMap[String,List[RodBind]] val VQSR_TAG_FT = "known=false,training=true,truth=%s,prior=%s" - val VQSR_DBSNP_TAG = "known=true,training=false,truth=false" + val VQSR_DBSNP_TAG = "known=true,training=false,truth=false,prior=0.1" for ( tf <- List( (true,false),(false,true),(true,true)) ) { @@ -69,7 +69,7 @@ class Exome_VQSR_FullSearch extends QScript { trait ExpandedIntervals extends CommandLineGATK { if (EXPAND_INTS > 0) { - this.intervals :+= ei.outList + this.intervals :+= ei.outList.getAbsoluteFile } } @@ -177,7 +177,7 @@ class Exome_VQSR_FullSearch extends QScript { exons.tranchesFile = new File(nameFormat.format("exons")+"tranche") exons.recalFile = new File(nameFormat.format("exons")+"recal") var flanks = new ContrastiveRecalibrator with VQSR_Args - flanks.intervals :+= ei.outList + flanks.intervals :+= ei.outList.getAbsoluteFile flanks.jarFile = GATK_JAR flanks.memoryLimit = Some(8) flanks.reference_sequence = REF @@ -211,6 +211,12 @@ class Exome_VQSR_FullSearch extends QScript { trait ApplyArgs extends ApplyRecalibration with ImplicitArgs { this.tranchesFile = recal.tranchesFile this.recalFile = recal.recalFile + for ( r <- recal.rodBind ) { + if ( r.trackName.startsWith("input") ) { + this.rodBind :+= r + } + } + this.memoryLimit = Some(4) } trait EvalArgs extends VariantEval with ImplicitArgs { @@ -218,6 +224,7 @@ class Exome_VQSR_FullSearch extends QScript { this.evalModule = List("TiTvVariantEvaluator","CountVariants","GenotypeConcordance") this.rodBind :+= RodBind("dbsnp","VCF",DBSNP_129) this.rodBind :+= RodBind("compAxiom","VCF",AXIOM_CHIP) + this.memoryLimit = Some(4) } val extender = if ( ext != null ) ".cut%.1f."+ext else ".cut%.1f" diff --git a/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q b/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q index b8b150e54..f4990d258 100755 --- a/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q +++ b/scala/qscript/oneoffs/chartl/ScatterGatherAssociation.q @@ -80,6 +80,9 @@ class ScatterGatherAssociation extends QScript { @Output(doc="sd") @Gather(classOf[SimpleTextGatherFunction]) var sd : File = new File(String.format("%s.%s.%s",base,"SampleDepth",ext)) + @Output(doc="rli") + @Gather(classOf[SimpleTextGatherFunction]) + var rli : File = new File(String.format("%s.%s.%s",base,"ReadsLargeInsertSize",ext)) override def commandLine = { var bedStr : String = ""