Old changes to the exome VQSR search.

SGA updated to include new proportion-based insert size test.

Major fix for dichotomization test: MathUtils now optionally ignores NaN values for sums, averages, variances. In the future this feature can be pushed back into the AssociationContext object iself (e.g. no data? no entry), but it's kept like this for transparency for now.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5618 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-04-12 23:00:50 +00:00
parent 30a19a00fe
commit 8125b8b901
9 changed files with 146 additions and 50 deletions

View File

@ -64,6 +64,9 @@ public abstract class AssociationContext<X,Y> {
window.add(sampleData);
}
public void flush() {
window = new ArrayList<Map<Sample,Y>>(size);
}
public boolean isFull() {
return window.size() >= size;
@ -78,4 +81,7 @@ public abstract class AssociationContext<X,Y> {
public int getWindowSize() {
return window.size();
}
public boolean testOnFlush() { return true; }
public boolean canFlush() { return true; }
}

View File

@ -24,9 +24,17 @@ public class AssociationTestRunner {
final static int MAX_Q_VALUE = Integer.MAX_VALUE;
// todo -- this was written when ACs could implement interfaces, now that they extend, there's no multiple inheritance
final static Dichotomizer1D.Transform LOG_TRANSFORM = (new Dichotomizer1D()).new Transform() {
private double EPSILON = 1e-20;
@Override
public double transform(double d) {
return Math.log(d);
return Math.log(d+EPSILON);
}
};
final static Dichotomizer1D.Transform ARCSINE_TRANSFORM = (new Dichotomizer1D()).new Transform() {
@Override
public double transform(double d) {
return Math.asin(1.0-d);
}
};

View File

@ -15,10 +15,12 @@ public class RegionalAssociationHandler {
private MapExtender maps;
// todo -- the correct way to do this is via the PluginManager (a la VariantEval) but this is a QND implementation
private Set<AssociationContext> associations;
private boolean bedGraphFormat;
public RegionalAssociationHandler(Set<AssociationContext> contexts, Set<Sample> samples) {
public RegionalAssociationHandler(Set<AssociationContext> contexts, Set<Sample> samples, boolean bedGraph) {
maps = new MapExtender(samples);
associations = contexts;
bedGraphFormat = bedGraph;
}
public void updateExtender(MapHolder mapHolder) {
@ -32,45 +34,75 @@ public class RegionalAssociationHandler {
* @reduce: (List<AssociationContext>,AssociationContext) --> List<AssociationContextAtom>
* @implementation: just append
*/
public void runMapReduce() {
public Map<AssociationContext,String> runMapReduce() {
Map<AssociationContext,String> testResults = new HashMap<AssociationContext,String>();
if ( maps.getPreviousRef() != null && maps.getPreviousRef().getLocus().compareTo(getLocation()) != -1 ) {
testResults.putAll(flush());
}
for ( AssociationContext w : associations ) {
if ( w.filter(maps) ) {
w.addData(w.mapLocus(maps));
}
if ( w.isFull() ) {
testResults.put(w,runTest(w));
w.slide();
}
}
return testResults;
}
/**
* Switches what formatting to use based on wiggle or standard
* @param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q)
* implicit param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q)
* @return - test results in proper format
*/
public Map<AssociationContext,String> runTests(boolean bedGraphFormat) {
public String runTest(AssociationContext context) {
// todo -- maybe the tdf should be the whole window rather than just the most recent loc?
Map<AssociationContext,String> testResults = new HashMap<AssociationContext,String>(associations.size());
for ( AssociationContext w : associations ) {
if ( w.isFull() ) {
String outVal;
if ( bedGraphFormat ) {
Pair<Double,Pair<Double,Integer>> statVals = AssociationTestRunner.getTestValues(w);
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(w);
outVal = String.format("%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first,statVals.second.first,statVals.second.second,
simpleDichotVals.first,simpleDichotVals.second);
} else {
outVal = AssociationTestRunner.runTests(w);
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(w);
outVal += String.format("\tD: %.2f\tLogD: %.2f",simpleDichotVals.first,simpleDichotVals.second);
}
testResults.put(w,String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(),
maps.getReferenceContext().getLocus().getStart()-w.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal));
w.slide();
}
String outVal;
if ( bedGraphFormat ) {
Pair<Double,Pair<Double,Integer>> statVals = AssociationTestRunner.getTestValues(context);
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context);
outVal = String.format("%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first,statVals.second.first,statVals.second.second,
simpleDichotVals.first,simpleDichotVals.second);
} else {
outVal = AssociationTestRunner.runTests(context);
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context);
outVal += String.format("\tD: %.2f\tLogD: %.2f",simpleDichotVals.first,simpleDichotVals.second);
}
return testResults;
return String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(),
maps.getReferenceContext().getLocus().getStart()-context.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal);
}
public GenomeLoc getLocation() {
return maps.getReferenceContext().getLocus();
}
/**
* Flushes context on a jump between intervals. Can not return null.
* @return on-flush tests
*/
public Map<AssociationContext,String> flush() {
Map<AssociationContext,String> flushTests = new HashMap<AssociationContext,String>();
for ( AssociationContext ac : associations ) {
if ( ac.canFlush() ) {
if ( ac.testOnFlush() ) {
flushTests.put(ac,runTest(ac));
}
ac.flush();
}
}
return flushTests;
}
public Set<AssociationContext> getAssociations() {
return associations;
}
public MapExtender getExtender() {
return maps;
}
}

View File

@ -65,7 +65,7 @@ public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAs
public RegionalAssociationHandler reduceInit() {
Set<AssociationContext> validAssociations = getAssociations();
RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples());
RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples(),bedGraph);
return wac;
}
@ -76,14 +76,15 @@ public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAs
public RegionalAssociationHandler reduce(MapHolder map, RegionalAssociationHandler rac) {
rac.updateExtender(map);
Map<AssociationContext,String> testResults;
try {
rac.runMapReduce();
testResults = rac.runMapReduce();
} catch (Exception e) {
throw new StingException("Error in map reduce",e);
}
Map<AssociationContext,String> testsHere = rac.runTests(bedGraph);
if ( testsHere.size() > 0 ) {
for ( Map.Entry<AssociationContext,String> result : testsHere.entrySet() ) {
if ( testResults.size() > 0 ) {
for ( Map.Entry<AssociationContext,String> result : testResults.entrySet() ) {
out.get(result.getKey().getClass()).printf("%s%n",result.getValue());
}
}

View File

@ -46,9 +46,9 @@ public abstract class ValueTest extends CaseControl<Collection<Number>> implemen
for ( Map<Sample,Collection<Number>> sampleMap : window ) {
for ( Map.Entry<Sample,Collection<Number>> sampleEntry : sampleMap.entrySet() ) {
if ( sampleEntry.getKey().getProperty("cohort").equals("case") ) {
caseMeans.add(MathUtils.average(sampleEntry.getValue()));
caseMeans.add(MathUtils.average(sampleEntry.getValue(),true));
} else if ( sampleEntry.getKey().getProperty("cohort").equals("control") ) {
controlMeans.add(MathUtils.average(sampleEntry.getValue()));
controlMeans.add(MathUtils.average(sampleEntry.getValue(),true));
}
}
}
@ -83,11 +83,11 @@ public abstract class ValueTest extends CaseControl<Collection<Number>> implemen
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
return new Pair<Double,Double>(Double.NaN,Double.NaN);
}
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE));
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase);
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE),true);
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase,true);
double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size();
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL));
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl);
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL),true);
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl,true);
double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size();
double df_num = Math.pow(varCase/nCase + varControl/nControl,2);

View File

@ -41,11 +41,22 @@ public class Dichotomizer1D {
* @return - the so-called "Z"-factor (effect size/spread)
*/
public static double simpleGaussianDichotomy(Collection<Number> setOne, Collection<Number> setTwo) {
double meanOne = MathUtils.sum(setOne)/setOne.size();
double meanTwo = MathUtils.sum(setTwo)/setTwo.size();
double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne));
double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo));
double meanOne = MathUtils.average(setOne,true);
double meanTwo = MathUtils.average(setTwo,true);
double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne,true));
double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo,true));
/*
System.out.print("setOne: ");
for ( Number n : setOne ) {
System.out.printf(",%.2f",n.doubleValue());
}
System.out.print("\tsetTwo: ");
for ( Number n : setTwo ) {
System.out.printf(",%.2f",n.doubleValue());
}
System.out.printf("\tmn1: %.2f mn2: %.2f var1: %.2f var2: %.2f%n",meanOne,meanTwo,stdOne,stdTwo);
*/
return 1.0 - (3.0*(stdOne+stdTwo))/Math.abs(meanOne-meanTwo);
}

View File

@ -52,28 +52,56 @@ public class MathUtils {
/** Private constructor. No instantiating this class! */
private MathUtils() {}
public static double sum( Collection<Number> numbers ) {
public static double sum(Collection<Number> numbers) {
return sum(numbers,false);
}
public static double sum( Collection<Number> numbers, boolean ignoreNan ) {
double sum = 0;
for ( Number n : numbers ) {
sum += n.doubleValue();
if ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) {
sum += n.doubleValue();
}
}
return sum;
}
public static double average( Collection<Number> numbers ) {
return sum(numbers)/numbers.size();
public static int nonNanSize(Collection<Number> numbers) {
int size = 0;
for ( Number n : numbers) {
size += Double.isNaN(n.doubleValue()) ? 0 : 1;
}
return size;
}
public static double variance( Collection<Number> numbers, Number mean ) {
public static double average( Collection<Number> numbers, boolean ignoreNan) {
if ( ignoreNan ) {
return sum(numbers,true)/nonNanSize(numbers);
} else {
return sum(numbers,false)/nonNanSize(numbers);
}
}
public static double variance( Collection<Number> numbers, Number mean, boolean ignoreNan ) {
double mn = mean.doubleValue();
double var = 0;
for ( Number n : numbers ) { var += Math.pow( n.doubleValue() - mn , 2); }
return var;
for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; }
if ( ignoreNan ) { return var/nonNanSize(numbers); }
return var/numbers.size();
}
public static double variance(Collection<Number> numbers, Number mean) {
return variance(numbers,mean,false);
}
public static double variance(Collection<Number> numbers, boolean ignoreNan) {
return variance(numbers,average(numbers,ignoreNan),ignoreNan);
}
public static double variance(Collection<Number> numbers) {
return variance(numbers,average(numbers));
return variance(numbers,average(numbers,false),false);
}
public static double sum(double[] values) {

View File

@ -31,7 +31,7 @@ class Exome_VQSR_FullSearch extends QScript {
var VQSR_RODBINDS : HashMap[String,List[RodBind]] = new HashMap[String,List[RodBind]]
val VQSR_TAG_FT = "known=false,training=true,truth=%s,prior=%s"
val VQSR_DBSNP_TAG = "known=true,training=false,truth=false"
val VQSR_DBSNP_TAG = "known=true,training=false,truth=false,prior=0.1"
for ( tf <- List( (true,false),(false,true),(true,true)) ) {
@ -69,7 +69,7 @@ class Exome_VQSR_FullSearch extends QScript {
trait ExpandedIntervals extends CommandLineGATK {
if (EXPAND_INTS > 0) {
this.intervals :+= ei.outList
this.intervals :+= ei.outList.getAbsoluteFile
}
}
@ -177,7 +177,7 @@ class Exome_VQSR_FullSearch extends QScript {
exons.tranchesFile = new File(nameFormat.format("exons")+"tranche")
exons.recalFile = new File(nameFormat.format("exons")+"recal")
var flanks = new ContrastiveRecalibrator with VQSR_Args
flanks.intervals :+= ei.outList
flanks.intervals :+= ei.outList.getAbsoluteFile
flanks.jarFile = GATK_JAR
flanks.memoryLimit = Some(8)
flanks.reference_sequence = REF
@ -211,6 +211,12 @@ class Exome_VQSR_FullSearch extends QScript {
trait ApplyArgs extends ApplyRecalibration with ImplicitArgs {
this.tranchesFile = recal.tranchesFile
this.recalFile = recal.recalFile
for ( r <- recal.rodBind ) {
if ( r.trackName.startsWith("input") ) {
this.rodBind :+= r
}
}
this.memoryLimit = Some(4)
}
trait EvalArgs extends VariantEval with ImplicitArgs {
@ -218,6 +224,7 @@ class Exome_VQSR_FullSearch extends QScript {
this.evalModule = List("TiTvVariantEvaluator","CountVariants","GenotypeConcordance")
this.rodBind :+= RodBind("dbsnp","VCF",DBSNP_129)
this.rodBind :+= RodBind("compAxiom","VCF",AXIOM_CHIP)
this.memoryLimit = Some(4)
}
val extender = if ( ext != null ) ".cut%.1f."+ext else ".cut%.1f"

View File

@ -80,6 +80,9 @@ class ScatterGatherAssociation extends QScript {
@Output(doc="sd")
@Gather(classOf[SimpleTextGatherFunction])
var sd : File = new File(String.format("%s.%s.%s",base,"SampleDepth",ext))
@Output(doc="rli")
@Gather(classOf[SimpleTextGatherFunction])
var rli : File = new File(String.format("%s.%s.%s",base,"ReadsLargeInsertSize",ext))
override def commandLine = {
var bedStr : String = ""