Old changes to the exome VQSR search.
SGA updated to include new proportion-based insert size test. Major fix for dichotomization test: MathUtils now optionally ignores NaN values for sums, averages, variances. In the future this feature can be pushed back into the AssociationContext object itself (e.g. no data? no entry), but it's kept like this for transparency for now. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5618 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
30a19a00fe
commit
8125b8b901
|
|
@ -64,6 +64,9 @@ public abstract class AssociationContext<X,Y> {
|
|||
window.add(sampleData);
|
||||
}
|
||||
|
||||
public void flush() {
|
||||
window = new ArrayList<Map<Sample,Y>>(size);
|
||||
}
|
||||
|
||||
public boolean isFull() {
|
||||
return window.size() >= size;
|
||||
|
|
@ -78,4 +81,7 @@ public abstract class AssociationContext<X,Y> {
|
|||
public int getWindowSize() {
|
||||
return window.size();
|
||||
}
|
||||
|
||||
public boolean testOnFlush() { return true; }
|
||||
public boolean canFlush() { return true; }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,9 +24,17 @@ public class AssociationTestRunner {
|
|||
final static int MAX_Q_VALUE = Integer.MAX_VALUE;
|
||||
// todo -- this was written when ACs could implement interfaces, now that they extend, there's no multiple inheritance
|
||||
final static Dichotomizer1D.Transform LOG_TRANSFORM = (new Dichotomizer1D()).new Transform() {
|
||||
private double EPSILON = 1e-20;
|
||||
@Override
|
||||
public double transform(double d) {
|
||||
return Math.log(d);
|
||||
return Math.log(d+EPSILON);
|
||||
}
|
||||
};
|
||||
|
||||
final static Dichotomizer1D.Transform ARCSINE_TRANSFORM = (new Dichotomizer1D()).new Transform() {
|
||||
@Override
|
||||
public double transform(double d) {
|
||||
return Math.asin(1.0-d);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -15,10 +15,12 @@ public class RegionalAssociationHandler {
|
|||
private MapExtender maps;
|
||||
// todo -- the correct way to do this is via the PluginManager (a la VariantEval) but this is a QND implementation
|
||||
private Set<AssociationContext> associations;
|
||||
private boolean bedGraphFormat;
|
||||
|
||||
public RegionalAssociationHandler(Set<AssociationContext> contexts, Set<Sample> samples) {
|
||||
public RegionalAssociationHandler(Set<AssociationContext> contexts, Set<Sample> samples, boolean bedGraph) {
|
||||
maps = new MapExtender(samples);
|
||||
associations = contexts;
|
||||
bedGraphFormat = bedGraph;
|
||||
}
|
||||
|
||||
public void updateExtender(MapHolder mapHolder) {
|
||||
|
|
@ -32,45 +34,75 @@ public class RegionalAssociationHandler {
|
|||
* @reduce: (List<AssociationContext>,AssociationContext) --> List<AssociationContextAtom>
|
||||
* @implementation: just append
|
||||
*/
|
||||
public void runMapReduce() {
|
||||
public Map<AssociationContext,String> runMapReduce() {
|
||||
Map<AssociationContext,String> testResults = new HashMap<AssociationContext,String>();
|
||||
if ( maps.getPreviousRef() != null && maps.getPreviousRef().getLocus().compareTo(getLocation()) != -1 ) {
|
||||
testResults.putAll(flush());
|
||||
}
|
||||
for ( AssociationContext w : associations ) {
|
||||
if ( w.filter(maps) ) {
|
||||
w.addData(w.mapLocus(maps));
|
||||
}
|
||||
|
||||
if ( w.isFull() ) {
|
||||
testResults.put(w,runTest(w));
|
||||
w.slide();
|
||||
}
|
||||
}
|
||||
|
||||
return testResults;
|
||||
}
|
||||
|
||||
/**
|
||||
* Switches what formatting to use based on wiggle or standard
|
||||
* @param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q)
|
||||
* implicit param bedGraphFormat - use bedgraph format (s p q) or standard (S: s P: p Q: q)
|
||||
* @return - test results in proper format
|
||||
*/
|
||||
public Map<AssociationContext,String> runTests(boolean bedGraphFormat) {
|
||||
public String runTest(AssociationContext context) {
|
||||
// todo -- maybe the tdf should be the whole window rather than just the most recent loc?
|
||||
Map<AssociationContext,String> testResults = new HashMap<AssociationContext,String>(associations.size());
|
||||
for ( AssociationContext w : associations ) {
|
||||
if ( w.isFull() ) {
|
||||
String outVal;
|
||||
if ( bedGraphFormat ) {
|
||||
Pair<Double,Pair<Double,Integer>> statVals = AssociationTestRunner.getTestValues(w);
|
||||
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(w);
|
||||
outVal = String.format("%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first,statVals.second.first,statVals.second.second,
|
||||
simpleDichotVals.first,simpleDichotVals.second);
|
||||
} else {
|
||||
outVal = AssociationTestRunner.runTests(w);
|
||||
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(w);
|
||||
outVal += String.format("\tD: %.2f\tLogD: %.2f",simpleDichotVals.first,simpleDichotVals.second);
|
||||
}
|
||||
testResults.put(w,String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(),
|
||||
maps.getReferenceContext().getLocus().getStart()-w.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal));
|
||||
w.slide();
|
||||
}
|
||||
String outVal;
|
||||
if ( bedGraphFormat ) {
|
||||
Pair<Double,Pair<Double,Integer>> statVals = AssociationTestRunner.getTestValues(context);
|
||||
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context);
|
||||
outVal = String.format("%.2f\t%.2e\t%d\t%.2f\t%.2f",statVals.first,statVals.second.first,statVals.second.second,
|
||||
simpleDichotVals.first,simpleDichotVals.second);
|
||||
} else {
|
||||
outVal = AssociationTestRunner.runTests(context);
|
||||
Pair<Double,Double> simpleDichotVals = AssociationTestRunner.getDichotomizedValues(context);
|
||||
outVal += String.format("\tD: %.2f\tLogD: %.2f",simpleDichotVals.first,simpleDichotVals.second);
|
||||
}
|
||||
return testResults;
|
||||
return String.format("%s\t%d\t%d\t%s",maps.getReferenceContext().getLocus().getContig(),
|
||||
maps.getReferenceContext().getLocus().getStart()-context.getWindowSize()-1,maps.getReferenceContext().getLocus().getStart()+1, outVal);
|
||||
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() {
|
||||
return maps.getReferenceContext().getLocus();
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes context on a jump between intervals. Can not return null.
|
||||
* @return on-flush tests
|
||||
*/
|
||||
public Map<AssociationContext,String> flush() {
|
||||
Map<AssociationContext,String> flushTests = new HashMap<AssociationContext,String>();
|
||||
for ( AssociationContext ac : associations ) {
|
||||
if ( ac.canFlush() ) {
|
||||
if ( ac.testOnFlush() ) {
|
||||
flushTests.put(ac,runTest(ac));
|
||||
}
|
||||
ac.flush();
|
||||
}
|
||||
}
|
||||
|
||||
return flushTests;
|
||||
}
|
||||
|
||||
public Set<AssociationContext> getAssociations() {
|
||||
return associations;
|
||||
}
|
||||
|
||||
public MapExtender getExtender() {
|
||||
return maps;
|
||||
}
|
||||
}
|
||||
|
|
@ -65,7 +65,7 @@ public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAs
|
|||
|
||||
public RegionalAssociationHandler reduceInit() {
|
||||
Set<AssociationContext> validAssociations = getAssociations();
|
||||
RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples());
|
||||
RegionalAssociationHandler wac = new RegionalAssociationHandler(validAssociations,getSamples(),bedGraph);
|
||||
|
||||
return wac;
|
||||
}
|
||||
|
|
@ -76,14 +76,15 @@ public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAs
|
|||
|
||||
public RegionalAssociationHandler reduce(MapHolder map, RegionalAssociationHandler rac) {
|
||||
rac.updateExtender(map);
|
||||
Map<AssociationContext,String> testResults;
|
||||
try {
|
||||
rac.runMapReduce();
|
||||
testResults = rac.runMapReduce();
|
||||
} catch (Exception e) {
|
||||
throw new StingException("Error in map reduce",e);
|
||||
}
|
||||
Map<AssociationContext,String> testsHere = rac.runTests(bedGraph);
|
||||
if ( testsHere.size() > 0 ) {
|
||||
for ( Map.Entry<AssociationContext,String> result : testsHere.entrySet() ) {
|
||||
|
||||
if ( testResults.size() > 0 ) {
|
||||
for ( Map.Entry<AssociationContext,String> result : testResults.entrySet() ) {
|
||||
out.get(result.getKey().getClass()).printf("%s%n",result.getValue());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,9 +46,9 @@ public abstract class ValueTest extends CaseControl<Collection<Number>> implemen
|
|||
for ( Map<Sample,Collection<Number>> sampleMap : window ) {
|
||||
for ( Map.Entry<Sample,Collection<Number>> sampleEntry : sampleMap.entrySet() ) {
|
||||
if ( sampleEntry.getKey().getProperty("cohort").equals("case") ) {
|
||||
caseMeans.add(MathUtils.average(sampleEntry.getValue()));
|
||||
caseMeans.add(MathUtils.average(sampleEntry.getValue(),true));
|
||||
} else if ( sampleEntry.getKey().getProperty("cohort").equals("control") ) {
|
||||
controlMeans.add(MathUtils.average(sampleEntry.getValue()));
|
||||
controlMeans.add(MathUtils.average(sampleEntry.getValue(),true));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -83,11 +83,11 @@ public abstract class ValueTest extends CaseControl<Collection<Number>> implemen
|
|||
if ( caseControlVectors == null || caseControlVectors.get(CaseControl.Cohort.CASE) == null || caseControlVectors.get(CaseControl.Cohort.CONTROL) == null ) {
|
||||
return new Pair<Double,Double>(Double.NaN,Double.NaN);
|
||||
}
|
||||
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE));
|
||||
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase);
|
||||
double meanCase = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CASE),true);
|
||||
double varCase = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CASE),meanCase,true);
|
||||
double nCase = caseControlVectors.get(CaseControl.Cohort.CASE).size();
|
||||
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL));
|
||||
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl);
|
||||
double meanControl = MathUtils.average(caseControlVectors.get(CaseControl.Cohort.CONTROL),true);
|
||||
double varControl = MathUtils.variance(caseControlVectors.get(CaseControl.Cohort.CONTROL),meanControl,true);
|
||||
double nControl = caseControlVectors.get(CaseControl.Cohort.CONTROL).size();
|
||||
|
||||
double df_num = Math.pow(varCase/nCase + varControl/nControl,2);
|
||||
|
|
|
|||
|
|
@ -41,11 +41,22 @@ public class Dichotomizer1D {
|
|||
* @return - the so-called "Z"-factor (effect size/spread)
|
||||
*/
|
||||
public static double simpleGaussianDichotomy(Collection<Number> setOne, Collection<Number> setTwo) {
|
||||
double meanOne = MathUtils.sum(setOne)/setOne.size();
|
||||
double meanTwo = MathUtils.sum(setTwo)/setTwo.size();
|
||||
double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne));
|
||||
double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo));
|
||||
double meanOne = MathUtils.average(setOne,true);
|
||||
double meanTwo = MathUtils.average(setTwo,true);
|
||||
double stdOne = Math.sqrt(MathUtils.variance(setOne, meanOne,true));
|
||||
double stdTwo = Math.sqrt(MathUtils.variance(setTwo, meanTwo,true));
|
||||
/*
|
||||
System.out.print("setOne: ");
|
||||
for ( Number n : setOne ) {
|
||||
System.out.printf(",%.2f",n.doubleValue());
|
||||
}
|
||||
System.out.print("\tsetTwo: ");
|
||||
for ( Number n : setTwo ) {
|
||||
System.out.printf(",%.2f",n.doubleValue());
|
||||
}
|
||||
|
||||
System.out.printf("\tmn1: %.2f mn2: %.2f var1: %.2f var2: %.2f%n",meanOne,meanTwo,stdOne,stdTwo);
|
||||
*/
|
||||
return 1.0 - (3.0*(stdOne+stdTwo))/Math.abs(meanOne-meanTwo);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -52,28 +52,56 @@ public class MathUtils {
|
|||
/** Private constructor. No instantiating this class! */
|
||||
private MathUtils() {}
|
||||
|
||||
public static double sum( Collection<Number> numbers ) {
|
||||
public static double sum(Collection<Number> numbers) {
|
||||
return sum(numbers,false);
|
||||
}
|
||||
|
||||
public static double sum( Collection<Number> numbers, boolean ignoreNan ) {
|
||||
double sum = 0;
|
||||
for ( Number n : numbers ) {
|
||||
sum += n.doubleValue();
|
||||
if ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) {
|
||||
sum += n.doubleValue();
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static double average( Collection<Number> numbers ) {
|
||||
return sum(numbers)/numbers.size();
|
||||
public static int nonNanSize(Collection<Number> numbers) {
|
||||
int size = 0;
|
||||
for ( Number n : numbers) {
|
||||
size += Double.isNaN(n.doubleValue()) ? 0 : 1;
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
public static double variance( Collection<Number> numbers, Number mean ) {
|
||||
public static double average( Collection<Number> numbers, boolean ignoreNan) {
|
||||
if ( ignoreNan ) {
|
||||
return sum(numbers,true)/nonNanSize(numbers);
|
||||
} else {
|
||||
return sum(numbers,false)/nonNanSize(numbers);
|
||||
}
|
||||
}
|
||||
|
||||
public static double variance( Collection<Number> numbers, Number mean, boolean ignoreNan ) {
|
||||
double mn = mean.doubleValue();
|
||||
double var = 0;
|
||||
for ( Number n : numbers ) { var += Math.pow( n.doubleValue() - mn , 2); }
|
||||
return var;
|
||||
for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; }
|
||||
if ( ignoreNan ) { return var/nonNanSize(numbers); }
|
||||
return var/numbers.size();
|
||||
}
|
||||
|
||||
public static double variance(Collection<Number> numbers, Number mean) {
|
||||
return variance(numbers,mean,false);
|
||||
}
|
||||
|
||||
public static double variance(Collection<Number> numbers, boolean ignoreNan) {
|
||||
return variance(numbers,average(numbers,ignoreNan),ignoreNan);
|
||||
}
|
||||
|
||||
public static double variance(Collection<Number> numbers) {
|
||||
return variance(numbers,average(numbers));
|
||||
return variance(numbers,average(numbers,false),false);
|
||||
}
|
||||
|
||||
public static double sum(double[] values) {
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ class Exome_VQSR_FullSearch extends QScript {
|
|||
|
||||
var VQSR_RODBINDS : HashMap[String,List[RodBind]] = new HashMap[String,List[RodBind]]
|
||||
val VQSR_TAG_FT = "known=false,training=true,truth=%s,prior=%s"
|
||||
val VQSR_DBSNP_TAG = "known=true,training=false,truth=false"
|
||||
val VQSR_DBSNP_TAG = "known=true,training=false,truth=false,prior=0.1"
|
||||
|
||||
|
||||
for ( tf <- List( (true,false),(false,true),(true,true)) ) {
|
||||
|
|
@ -69,7 +69,7 @@ class Exome_VQSR_FullSearch extends QScript {
|
|||
|
||||
trait ExpandedIntervals extends CommandLineGATK {
|
||||
if (EXPAND_INTS > 0) {
|
||||
this.intervals :+= ei.outList
|
||||
this.intervals :+= ei.outList.getAbsoluteFile
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -177,7 +177,7 @@ class Exome_VQSR_FullSearch extends QScript {
|
|||
exons.tranchesFile = new File(nameFormat.format("exons")+"tranche")
|
||||
exons.recalFile = new File(nameFormat.format("exons")+"recal")
|
||||
var flanks = new ContrastiveRecalibrator with VQSR_Args
|
||||
flanks.intervals :+= ei.outList
|
||||
flanks.intervals :+= ei.outList.getAbsoluteFile
|
||||
flanks.jarFile = GATK_JAR
|
||||
flanks.memoryLimit = Some(8)
|
||||
flanks.reference_sequence = REF
|
||||
|
|
@ -211,6 +211,12 @@ class Exome_VQSR_FullSearch extends QScript {
|
|||
trait ApplyArgs extends ApplyRecalibration with ImplicitArgs {
|
||||
this.tranchesFile = recal.tranchesFile
|
||||
this.recalFile = recal.recalFile
|
||||
for ( r <- recal.rodBind ) {
|
||||
if ( r.trackName.startsWith("input") ) {
|
||||
this.rodBind :+= r
|
||||
}
|
||||
}
|
||||
this.memoryLimit = Some(4)
|
||||
}
|
||||
|
||||
trait EvalArgs extends VariantEval with ImplicitArgs {
|
||||
|
|
@ -218,6 +224,7 @@ class Exome_VQSR_FullSearch extends QScript {
|
|||
this.evalModule = List("TiTvVariantEvaluator","CountVariants","GenotypeConcordance")
|
||||
this.rodBind :+= RodBind("dbsnp","VCF",DBSNP_129)
|
||||
this.rodBind :+= RodBind("compAxiom","VCF",AXIOM_CHIP)
|
||||
this.memoryLimit = Some(4)
|
||||
}
|
||||
|
||||
val extender = if ( ext != null ) ".cut%.1f."+ext else ".cut%.1f"
|
||||
|
|
|
|||
|
|
@ -80,6 +80,9 @@ class ScatterGatherAssociation extends QScript {
|
|||
@Output(doc="sd")
|
||||
@Gather(classOf[SimpleTextGatherFunction])
|
||||
var sd : File = new File(String.format("%s.%s.%s",base,"SampleDepth",ext))
|
||||
@Output(doc="rli")
|
||||
@Gather(classOf[SimpleTextGatherFunction])
|
||||
var rli : File = new File(String.format("%s.%s.%s",base,"ReadsLargeInsertSize",ext))
|
||||
|
||||
override def commandLine = {
|
||||
var bedStr : String = ""
|
||||
|
|
|
|||
Loading…
Reference in New Issue