Addition of a BedTableCodec to allow for parsing of Bed-formatted tables (e.g. bedGraphs). Fixes for the recalibrator. Implementation of the data whitening input. Some TODOs in the RAW.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5529 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ccdc021207
commit
4c04c5a47a
|
|
@ -0,0 +1,34 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata.features.table;
|
||||||
|
|
||||||
|
import org.broad.tribble.Feature;
|
||||||
|
import org.broad.tribble.readers.LineReader;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 3/28/11
|
||||||
|
* Time: 2:47 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* The standard table codec with a slightly different parsing convention (expects loci as contig start stop, not contig:start-stop)
|
||||||
|
*/
|
||||||
|
public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Feature decode(String line) {
|
||||||
|
if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter) || line.startsWith(igvHeaderDelimiter))
|
||||||
|
return null;
|
||||||
|
String[] split = line.split(delimiterRegex);
|
||||||
|
if (split.length < 1)
|
||||||
|
throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format");
|
||||||
|
return new TableFeature(genomeLocParser.createGenomeLoc(split[0],Integer.parseInt(split[1])-1,Integer.parseInt(split[2])), Arrays.asList(split),header);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -16,16 +16,16 @@ import java.util.*;
|
||||||
* implementation of a simple table (tab or comma delimited format) input files
|
* implementation of a simple table (tab or comma delimited format) input files
|
||||||
*/
|
*/
|
||||||
public class TableCodec implements ReferenceDependentFeatureCodec {
|
public class TableCodec implements ReferenceDependentFeatureCodec {
|
||||||
private String delimiterRegex = "\\s+";
|
protected String delimiterRegex = "\\s+";
|
||||||
private String headerDelimiter = "HEADER";
|
protected String headerDelimiter = "HEADER";
|
||||||
private String igvHeaderDelimiter = "track";
|
protected String igvHeaderDelimiter = "track";
|
||||||
private String commentDelimiter = "#";
|
protected String commentDelimiter = "#";
|
||||||
private ArrayList<String> header = new ArrayList<String>();
|
protected ArrayList<String> header = new ArrayList<String>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The parser to use when resolving genome-wide locations.
|
* The parser to use when resolving genome-wide locations.
|
||||||
*/
|
*/
|
||||||
private GenomeLocParser genomeLocParser;
|
protected GenomeLocParser genomeLocParser;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the parser to use when resolving genetic data.
|
* Set the parser to use when resolving genetic data.
|
||||||
|
|
|
||||||
|
|
@ -14,15 +14,21 @@ import org.broadinstitute.sting.commandline.Hidden;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.features.table.BedTableCodec;
|
||||||
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
||||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.bed.BedParser;
|
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -33,42 +39,56 @@ import java.util.*;
|
||||||
* Time: 11:05 AM
|
* Time: 11:05 AM
|
||||||
* To change this template use File | Settings | File Templates.
|
* To change this template use File | Settings | File Templates.
|
||||||
*/
|
*/
|
||||||
public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociationRecalibrator.LocHashingPair,Set<RegionalAssociationRecalibrator.LocHashingPair>> {
|
public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociationRecalibrator.LocHashingPair,Set<RegionalAssociationRecalibrator.LocHashingPair>> implements TreeReducible<Set<RegionalAssociationRecalibrator.LocHashingPair>> {
|
||||||
|
|
||||||
@Argument(shortName="w",fullName="whiteningFile",doc="File containing the mean, max vectors and covariance matrix",required=false)
|
@Argument(shortName="w",fullName="whiteningFile",doc="File containing the mean, max vectors and covariance matrix",required=false)
|
||||||
File whitenFile = null;
|
File whitenFile = null;
|
||||||
@Argument(shortName="p",fullName="normParameter",doc="Exponent for vector norm; p > 4 will induce a p=infinity approximation",required=false)
|
@Argument(shortName="p",fullName="normParameter",doc="Exponent for vector norm; p > 4 will induce a p=infinity approximation",required=false)
|
||||||
double pNorm = 1.0;
|
double pNorm = 1.0;
|
||||||
@Hidden
|
|
||||||
@Argument(shortName="cg",fullName="capacityGuess",doc="this is hidden",required = false)
|
|
||||||
int initialCapacity = 3500000;
|
|
||||||
|
|
||||||
@Output
|
@Output
|
||||||
public PrintStream out;
|
public PrintStream out;
|
||||||
|
|
||||||
private DataWhitener whitener;
|
private DataWhitener whitener;
|
||||||
private double[] dataHolder;
|
private double[] dataHolder;
|
||||||
|
private int boundTables = 0;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
|
for ( ReferenceOrderedDataSource source : getToolkit().getRodDataSources() ) {
|
||||||
|
logger.debug(source.getType().getSimpleName());
|
||||||
|
if ( source.getType().equals(BedTableCodec.class)) {
|
||||||
|
++boundTables;
|
||||||
|
if ( ! source.getFile().getName().endsWith(".bedgraph") ) {
|
||||||
|
throw new UserException("Regional association requires bedgraph files. The file "+source.getFile().getAbsolutePath()+" does not have the proper extension.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if ( whitenFile != null ) {
|
if ( whitenFile != null ) {
|
||||||
whitener = getWhitenerFromFile(whitenFile);
|
whitener = getWhitenerFromFile(whitenFile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<LocHashingPair> reduceInit() { return new HashSet<LocHashingPair>(initialCapacity); }
|
public Set<LocHashingPair> reduceInit() { return new TreeSet<LocHashingPair>(); }
|
||||||
|
|
||||||
public LocHashingPair map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext ali ) {
|
public LocHashingPair map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext ali ) {
|
||||||
if ( tracker == null ) { return null; }
|
if ( tracker == null ) { return null; }
|
||||||
|
|
||||||
ArrayList<TableFeature> features = new ArrayList<TableFeature>(7);
|
ArrayList<TableFeature> features = new ArrayList<TableFeature>(boundTables);
|
||||||
|
|
||||||
for ( GATKFeature feature : tracker.getAllRods()) {
|
for ( GATKFeature feature : tracker.getAllRods()) {
|
||||||
Object uo = feature.getUnderlyingObject();
|
Object uo = feature.getUnderlyingObject();
|
||||||
if ( uo instanceof TableFeature ) {
|
if ( uo instanceof TableFeature && feature.getLocation().getStart() == ref.getLocus().getStart()) {
|
||||||
features.add((TableFeature)uo);
|
features.add((TableFeature)uo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( features.size() == 0 ) { return null; }
|
||||||
|
if ( features.size() != boundTables ) {
|
||||||
|
throw new UserException("Features do not line up at position "+ref.getLocus().toString());
|
||||||
|
}
|
||||||
|
|
||||||
if ( dataHolder == null ) {
|
if ( dataHolder == null ) {
|
||||||
dataHolder = new double[features.size()];
|
dataHolder = new double[features.size()];
|
||||||
}
|
}
|
||||||
|
|
@ -95,6 +115,17 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
|
||||||
return red;
|
return red;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Set<LocHashingPair> treeReduce(Set<LocHashingPair> left, Set<LocHashingPair> right) {
|
||||||
|
if ( left == null ) {
|
||||||
|
return right;
|
||||||
|
} else if ( right == null ) {
|
||||||
|
return left;
|
||||||
|
} else {
|
||||||
|
left.addAll(right);
|
||||||
|
return left;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void onTraversalDone(Set<LocHashingPair> normByLoc) {
|
public void onTraversalDone(Set<LocHashingPair> normByLoc) {
|
||||||
double[] normRanks = new double[(normByLoc.size())];
|
double[] normRanks = new double[(normByLoc.size())];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
|
|
@ -112,9 +143,49 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static DataWhitener getWhitenerFromFile(File file) {
|
private DataWhitener getWhitenerFromFile(File file){
|
||||||
// todo -- implement me
|
XReadLines lineReader;
|
||||||
return null;
|
try {
|
||||||
|
lineReader = new XReadLines(file);
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
throw new UserException("The provided whiten file could not be found",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
// first line is the mean vector
|
||||||
|
String[] meanLine = lineReader.next().split("\t");
|
||||||
|
if ( meanLine.length != boundTables ) {
|
||||||
|
String msg = String.format("The number of bound bedgraphs does not match the size of the mean value in the whitening file. bound: %d mean : %d",boundTables,meanLine.length);
|
||||||
|
throw new UserException(msg);
|
||||||
|
}
|
||||||
|
double[] mean = new double[meanLine.length];
|
||||||
|
for ( int i = 0 ; i < meanLine.length; i ++ ) {
|
||||||
|
mean[i] = Double.parseDouble(meanLine[i]);
|
||||||
|
}
|
||||||
|
// skip
|
||||||
|
lineReader.next();
|
||||||
|
// now the max
|
||||||
|
String[] maxLine = lineReader.next().split("\t");
|
||||||
|
double[] max = new double[maxLine.length];
|
||||||
|
for ( int i = 0; i < maxLine.length ; i++ ) {
|
||||||
|
max[i] = Double.parseDouble(maxLine[i]);
|
||||||
|
}
|
||||||
|
// skip
|
||||||
|
lineReader.next();
|
||||||
|
// now the covariance matrix
|
||||||
|
double[][] cov = new double[max.length][max.length];
|
||||||
|
for ( int i = 0; i < max.length; i ++ ) {
|
||||||
|
String[] row = lineReader.next().split("\t");
|
||||||
|
for ( int j = 0; j < max.length; j ++ ) {
|
||||||
|
cov[i][j] = Double.parseDouble(row[j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DataWhitener dataWhitener = new DataWhitener();
|
||||||
|
dataWhitener.setCov(cov);
|
||||||
|
dataWhitener.setMaxs(max);
|
||||||
|
dataWhitener.setMeans(mean);
|
||||||
|
|
||||||
|
return dataWhitener;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static double calculateNorm(double[] vec, double p) {
|
private static double calculateNorm(double[] vec, double p) {
|
||||||
|
|
@ -130,8 +201,14 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
|
||||||
|
|
||||||
private final Algebra ALGEBRA = new Algebra();
|
private final Algebra ALGEBRA = new Algebra();
|
||||||
private final DoubleDoubleFunction SUBTRACT = new DoubleDoubleFunction() {
|
private final DoubleDoubleFunction SUBTRACT = new DoubleDoubleFunction() {
|
||||||
|
// note: want vectors that have NaN to have norm calculated within the
|
||||||
|
// n-k dimensional subspace. This is equivalent to setting the NaNs to
|
||||||
|
// a post-whitening value of zero.
|
||||||
@Override
|
@Override
|
||||||
public double apply(double v, double v1) {
|
public double apply(double v, double v1) {
|
||||||
|
if ( v == Double.NaN ) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
return v - v1;
|
return v - v1;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
@ -153,8 +230,9 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
|
||||||
|
|
||||||
public void setCov(double[][] cov) {
|
public void setCov(double[][] cov) {
|
||||||
EigenvalueDecomposition decomp = new EigenvalueDecomposition(new DenseDoubleMatrix2D(cov));
|
EigenvalueDecomposition decomp = new EigenvalueDecomposition(new DenseDoubleMatrix2D(cov));
|
||||||
|
// todo -- this bastard gets sorted, so we're going to have to match the mean, max, and data to this sorting (somehow?)
|
||||||
DoubleMatrix2D diag = decomp.getD();
|
DoubleMatrix2D diag = decomp.getD();
|
||||||
for ( int i = 0; i < diag.size(); i ++ ) {
|
for ( int i = 0; i < diag.rows(); i ++ ) {
|
||||||
diag.set(i,i, Math.pow(diag.get(i,i),-0.5));
|
diag.set(i,i, Math.pow(diag.get(i,i),-0.5));
|
||||||
}
|
}
|
||||||
transform = ALGEBRA.mult(diag,ALGEBRA.transpose(decomp.getV()));
|
transform = ALGEBRA.mult(diag,ALGEBRA.transpose(decomp.getV()));
|
||||||
|
|
@ -166,12 +244,20 @@ public class RegionalAssociationRecalibrator extends RodWalker<RegionalAssociati
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class LocHashingPair extends Pair<GenomeLoc,Double> {
|
class LocHashingPair extends Pair<GenomeLoc,Double> implements Comparable {
|
||||||
|
|
||||||
public LocHashingPair(GenomeLoc g, Double d) {
|
public LocHashingPair(GenomeLoc g, Double d) {
|
||||||
super(g,d);
|
super(g,d);
|
||||||
}
|
}
|
||||||
|
|
||||||
public int hashCode() { return first.hashCode(); }
|
public int hashCode() { return first.hashCode(); }
|
||||||
|
|
||||||
|
public int compareTo(Object o) {
|
||||||
|
if ( o instanceof LocHashingPair ) {
|
||||||
|
return this.first.compareTo(((LocHashingPair) o).first);
|
||||||
|
} else {
|
||||||
|
return Integer.MIN_VALUE;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,9 @@ import java.util.*;
|
||||||
* @Author chartl
|
* @Author chartl
|
||||||
* @Date 2011-02-23
|
* @Date 2011-02-23
|
||||||
* Generalized framework for regional (windowed) associations
|
* Generalized framework for regional (windowed) associations
|
||||||
|
* -- todos --
|
||||||
|
* : todo --
|
||||||
|
* Cap statistics output (sometimes see Infinity or -Infinity) [fixed cap, or by k*max_seen_so_far]
|
||||||
*/
|
*/
|
||||||
@Downsample(by= DownsampleType.NONE)
|
@Downsample(by= DownsampleType.NONE)
|
||||||
public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAssociationHandler> implements TreeReducible<RegionalAssociationHandler> {
|
public class RegionalAssociationWalker extends LocusWalker<MapHolder, RegionalAssociationHandler> implements TreeReducible<RegionalAssociationHandler> {
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,6 @@ public class ReferenceMismatches extends ZStatistic {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getWindowSize() { return 100; }
|
public int getWindowSize() { return 100; }
|
||||||
public int slideByValue() { return 25; }
|
public int slideByValue() { return 10; }
|
||||||
public boolean usePreviouslySeenReads() { return true; }
|
public boolean usePreviouslySeenReads() { return true; }
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue