First pass at recalibrating associations, with optional data whitening. Modification to the TableCodec so it can natively read bedgraph files (just needed to add an extra header marker: "track").

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5515 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-03-25 19:35:39 +00:00
parent ac39f5532e
commit fe7f45ee2e
5 changed files with 200 additions and 4 deletions

View File

@ -18,6 +18,7 @@ import java.util.*;
public class TableCodec implements ReferenceDependentFeatureCodec {
private String delimiterRegex = "\\s+";
private String headerDelimiter = "HEADER";
private String igvHeaderDelimiter = "track";
private String commentDelimiter = "#";
private ArrayList<String> header = new ArrayList<String>();
@ -43,7 +44,7 @@ public class TableCodec implements ReferenceDependentFeatureCodec {
@Override
public Feature decode(String line) {
if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter))
if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter) || line.startsWith(igvHeaderDelimiter))
return null;
String[] split = line.split(delimiterRegex);
if (split.length < 1)

View File

@ -26,7 +26,7 @@ public class AssociationTestRunner {
// todo -- this was written when ACs could implement interfaces, now that they extend, there's no multiple inheritance
static Normal standardNormal = new Normal(0.0,1.0,null);
private static int pToQ(double p) {
public static int pToQ(double p) {
return Math.min((int) Math.floor(QualityUtils.phredScaleErrorRate(p)),MAX_Q_VALUE);
}

View File

@ -0,0 +1,177 @@
package org.broadinstitute.sting.oneoffprojects.walkers.association;
import cern.colt.function.DoubleDoubleFunction;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.impl.DenseDoubleMatrix1D;
import cern.colt.matrix.impl.DenseDoubleMatrix2D;
import cern.colt.matrix.linalg.Algebra;
import cern.colt.matrix.linalg.EigenvalueDecomposition;
import org.broad.tribble.bed.BEDFeature;
import org.broad.tribble.bed.SimpleBEDFeature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.bed.BedParser;
import org.broadinstitute.sting.utils.collections.Pair;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 3/25/11
* Time: 11:05 AM
* To change this template use File | Settings | File Templates.
*/
public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociationRecalibrator.LocHashingPair,Set<RegionalAssociationRecalibrator.LocHashingPair>> {
@Argument(shortName="w",fullName="whiteningFile",doc="File containing the mean, max vectors and covariance matrix",required=false)
File whitenFile = null;
@Argument(shortName="p",fullName="normParameter",doc="Exponent for vector norm; p > 4 will induce a p=infinity approximation",required=false)
double pNorm = 1.0;
@Hidden
@Argument(shortName="cg",fullName="capacityGuess",doc="this is hidden",required = false)
int initialCapacity = 3500000;
@Output
public PrintStream out;
private DataWhitener whitener;
private double[] dataHolder;
public void initialize() {
if ( whitenFile != null ) {
whitener = getWhitenerFromFile(whitenFile);
}
}
public Set<LocHashingPair> reduceInit() { return new HashSet<LocHashingPair>(initialCapacity); }
public LocHashingPair map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext ali ) {
if ( tracker == null ) { return null; }
ArrayList<TableFeature> features = new ArrayList<TableFeature>(7);
for ( GATKFeature feature : tracker.getAllRods() ) {
Object uo = feature.getUnderlyingObject();
if ( uo instanceof TableFeature ) {
features.add((TableFeature)uo);
}
}
if ( dataHolder == null ) {
dataHolder = new double[features.size()];
}
int idx = 0;
for ( TableFeature f : features ) {
dataHolder[idx] = Double.parseDouble(f.getValue(3));
idx++;
}
double normVal;
if ( whitener != null ) {
normVal = calculateNorm(whitener.whiten(dataHolder),pNorm);
} else {
normVal = calculateNorm(dataHolder,pNorm);
}
return new LocHashingPair(features.get(0).getLocation(),normVal);
}
public Set<LocHashingPair> reduce(LocHashingPair map, Set<LocHashingPair> red) {
if ( map != null ) { red.add(map); }
return red;
}
public void onTraversalDone(Set<LocHashingPair> normByLoc) {
double[] normRanks = new double[(normByLoc.size())];
int offset = 0;
for ( LocHashingPair lhp : normByLoc ) {
normRanks[offset]=(lhp.second);
offset++;
}
Arrays.sort(normRanks);
for ( LocHashingPair lhp : normByLoc ) {
int rank = Arrays.binarySearch(normRanks,lhp.second);
double prob = ((double) rank)/((double)normRanks.length);
int qual = AssociationTestRunner.pToQ(prob);
out.printf("%s\t%d\t%d\t%d\t%.2e\t%d",lhp.getFirst().getContig(),lhp.getFirst().getStart(),lhp.getFirst().getStop(),
qual,prob,rank);
}
}
private static DataWhitener getWhitenerFromFile(File file) {
// todo -- implement me
return null;
}
private static double calculateNorm(double[] vec, double p) {
double sumExp = 0.0;
for ( double v : vec ) {
sumExp += Math.pow(Math.abs(v),p);
}
return Math.pow(sumExp,1.0/p);
}
class DataWhitener {
private final Algebra ALGEBRA = new Algebra();
private final DoubleDoubleFunction SUBTRACT = new DoubleDoubleFunction() {
@Override
public double apply(double v, double v1) {
return v - v1;
}
};
private DenseDoubleMatrix1D means;
private DenseDoubleMatrix1D maxAbs;
private DoubleMatrix2D transform;
public DataWhitener() {
}
public void setMeans(double[] inMeans) {
means = new DenseDoubleMatrix1D(inMeans);
}
public void setMaxs(double[] maxs) {
maxAbs = new DenseDoubleMatrix1D(maxs);
}
public void setCov(double[][] cov) {
EigenvalueDecomposition decomp = new EigenvalueDecomposition(new DenseDoubleMatrix2D(cov));
DoubleMatrix2D diag = decomp.getD();
for ( int i = 0; i < diag.size(); i ++ ) {
diag.set(i,i, Math.pow(diag.get(i,i),-0.5));
}
transform = ALGEBRA.mult(diag,ALGEBRA.transpose(decomp.getV()));
}
public double[] whiten(double[] value) {
DenseDoubleMatrix1D vec = new DenseDoubleMatrix1D(value);
return ALGEBRA.mult(transform, vec.assign(means, SUBTRACT) ).toArray();
}
}
class LocHashingPair extends Pair<GenomeLoc,Double> {
public LocHashingPair(GenomeLoc g, Double d) {
super(g,d);
}
public int hashCode() { return first.hashCode(); }
}
}

View File

@ -40,6 +40,13 @@ public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAs
public void initialize() {
for ( Sample s : getSamples() ) {
if ( s.getProperty("cohort") == null ) {
throw new UserException("Sample "+s.getId()+" does not have a cohort property associated with it. "+
"Please ensure that metadata is bound with -SM and that sample "+s.getId()+" has the cohort property assigned.");
}
}
Set<AssociationContext> validAssociations = getAssociations();
if ( bedGraph ) {

View File

@ -23,7 +23,7 @@ class ScatterGatherAssociation extends QScript {
@Argument(fullName="memoryLimit",shortName="M",doc="Memory limit for SG jobs",required=false)
var memLimit : Int = 4
@Argument(fullName="scatterJobs",shortName="SJ",doc="Number of scatter jobs",required=false)
var scatterJobs : Int = 75
var scatterJobs : Int = 125
val ASSOCIATION_TESTS = List("BaseQualityScore","InsertSizeDistribution","MappingQuality0",
"MateMappingQuality","MateOtherContig","MateSameStrand","MateUnmapped","MismatchRate",
@ -33,6 +33,9 @@ class ScatterGatherAssociation extends QScript {
class RegionalAssociationSG(base : String, ext : String) extends CommandLineGATK with ScatterGatherableFunction{
this.analysis_type = "RegionalAssociation"
@Argument(doc="useBed")
var useBed : Boolean = true
// the rest are output files implicitly constructed by the multiplexer
@Output(doc="bqs")
@ -78,7 +81,13 @@ class ScatterGatherAssociation extends QScript {
@Gather(classOf[SimpleTextGatherFunction])
var sd : File = new File(String.format("%s.%s.%s",base,"SampleDepth",ext))
override def commandLine = super.commandLine + " -AT ALL -o %s".format(base)
override def commandLine = {
var bedStr : String = ""
if ( useBed ) {
bedStr = " -bg "
}
super.commandLine + " -AT ALL -o %s%s".format(base,bedStr)
}
}
def script = {
@ -91,6 +100,8 @@ class ScatterGatherAssociation extends QScript {
}
var association = new RegionalAssociationSG(outBase,ext)
association.useBed = ! dontUseBedGraph
association.sample_metadata :+= metaData
association.intervals :+= intervalsFile
association.reference_sequence = referenceFile
association.jarFile = gatkJar