From 4c04c5a47a42f218ed4b2cec1c845da50a5c0bbc Mon Sep 17 00:00:00 2001 From: chartl Date: Mon, 28 Mar 2011 21:35:09 +0000 Subject: [PATCH] Addition of a BedTableCodec to allow for parsing of Bed-formatted tables (e.g. bedGraphs). Fixes for the recalibrator. Implementation of the data whitening input. Some TODOs in the RAW. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5529 348d0f76-0448-11de-a6fe-93d51630548a --- .../refdata/features/table/BedTableCodec.java | 34 ++++++ .../refdata/features/table/TableCodec.java | 12 +- .../RegionalAssociationRecalibrator.java | 112 ++++++++++++++++-- .../RegionalAssociationWalker.java | 3 + .../modules/ReferenceMismatches.java | 2 +- 5 files changed, 143 insertions(+), 20 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java new file mode 100755 index 000000000..b831606a3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.gatk.refdata.features.table; + +import org.broad.tribble.Feature; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 3/28/11 + * Time: 2:47 PM + * To change this template use File | Settings | File Templates. + */ +/** + * The standard table codec with a slightly different parsing convention (expects loci as contig start stop, not contig:start-stop) + */ +public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec { + + @Override + public Feature decode(String line) { + if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter) || line.startsWith(igvHeaderDelimiter)) + return null; + String[] split = line.split(delimiterRegex); + if (split.length < 1) + throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format"); + return new TableFeature(genomeLocParser.createGenomeLoc(split[0],Integer.parseInt(split[1])-1,Integer.parseInt(split[2])), Arrays.asList(split),header); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java index 3e8d884be..6f0a712bf 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java @@ -16,16 +16,16 @@ import java.util.*; * implementation of a simple table (tab or comma delimited format) input files */ public class TableCodec implements ReferenceDependentFeatureCodec { - private String delimiterRegex = "\\s+"; - private String headerDelimiter = "HEADER"; - private String igvHeaderDelimiter = "track"; - private String commentDelimiter = "#"; - private ArrayList header = new ArrayList(); + protected String delimiterRegex = "\\s+"; + protected String headerDelimiter = "HEADER"; + protected String igvHeaderDelimiter = "track"; + protected String commentDelimiter = "#"; + protected ArrayList header = new ArrayList(); /** * The parser to use when resolving genome-wide locations. */ - private GenomeLocParser genomeLocParser; + protected GenomeLocParser genomeLocParser; /** * Set the parser to use when resolving genetic data. diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java index 39f357f59..4e6ab7785 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationRecalibrator.java @@ -14,15 +14,21 @@ import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.features.table.BedTableCodec; import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.bed.BedParser; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; import java.io.File; +import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.*; @@ -33,42 +39,56 @@ import java.util.*; * Time: 11:05 AM * To change this template use File | Settings | File Templates. */ -public class RegionalAssociationRecalibrator extends RodWalker> { +public class RegionalAssociationRecalibrator extends RodWalker> implements TreeReducible> { @Argument(shortName="w",fullName="whiteningFile",doc="File containing the mean, max vectors and covariance matrix",required=false) File whitenFile = null; @Argument(shortName="p",fullName="normParameter",doc="Exponent for vector norm; p > 4 will induce a p=infinity approximation",required=false) double pNorm = 1.0; - @Hidden - @Argument(shortName="cg",fullName="capacityGuess",doc="this is hidden",required = false) - int initialCapacity = 3500000; @Output public PrintStream out; private DataWhitener whitener; private double[] dataHolder; + private int boundTables = 0; public void initialize() { + + for ( ReferenceOrderedDataSource source : getToolkit().getRodDataSources() ) { + logger.debug(source.getType().getSimpleName()); + if ( source.getType().equals(BedTableCodec.class)) { + ++boundTables; + if ( ! source.getFile().getName().endsWith(".bedgraph") ) { + throw new UserException("Regional association requires bedgraph files. The file "+source.getFile().getAbsolutePath()+" does not have the proper extension."); + } + } + } + if ( whitenFile != null ) { whitener = getWhitenerFromFile(whitenFile); } } - public Set reduceInit() { return new HashSet(initialCapacity); } + public Set reduceInit() { return new TreeSet(); } public LocHashingPair map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext ali ) { if ( tracker == null ) { return null; } - ArrayList features = new ArrayList(7); + ArrayList features = new ArrayList(boundTables); - for ( GATKFeature feature : tracker.getAllRods() ) { + for ( GATKFeature feature : tracker.getAllRods()) { Object uo = feature.getUnderlyingObject(); - if ( uo instanceof TableFeature ) { + if ( uo instanceof TableFeature && feature.getLocation().getStart() == ref.getLocus().getStart()) { features.add((TableFeature)uo); } } + if ( features.size() == 0 ) { return null; } + if ( features.size() != boundTables ) { + throw new UserException("Features do not line up at position "+ref.getLocus().toString()); + } + if ( dataHolder == null ) { dataHolder = new double[features.size()]; } @@ -95,6 +115,17 @@ public class RegionalAssociationRecalibrator extends RodWalker treeReduce(Set left, Set right) { + if ( left == null ) { + return right; + } else if ( right == null ) { + return left; + } else { + left.addAll(right); + return left; + } + } + public void onTraversalDone(Set normByLoc) { double[] normRanks = new double[(normByLoc.size())]; int offset = 0; @@ -112,9 +143,49 @@ public class RegionalAssociationRecalibrator extends RodWalker { + class LocHashingPair extends Pair implements Comparable { public LocHashingPair(GenomeLoc g, Double d) { super(g,d); } public int hashCode() { return first.hashCode(); } + + public int compareTo(Object o) { + if ( o instanceof LocHashingPair ) { + return this.first.compareTo(((LocHashingPair) o).first); + } else { + return Integer.MIN_VALUE; + } + } } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java index d3f6b0585..9a2dab128 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/RegionalAssociationWalker.java @@ -26,6 +26,9 @@ import java.util.*; * @Author chartl * @Date 2011-02-23 * Generalized framework for regional (windowed) associations + * -- todos -- + * : todo -- + * Cap statistics output (sometimes see Infinity or -Infinity) [fixed cap, or by k*max_seen_so_far] */ @Downsample(by= DownsampleType.NONE) public class RegionalAssociationWalker extends LocusWalker implements TreeReducible { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/ReferenceMismatches.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/ReferenceMismatches.java index 424f6bb8c..585a72c17 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/ReferenceMismatches.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/association/modules/ReferenceMismatches.java @@ -43,6 +43,6 @@ public class ReferenceMismatches extends ZStatistic { } public int getWindowSize() { return 100; } - public int slideByValue() { return 25; } + public int slideByValue() { return 10; } public boolean usePreviouslySeenReads() { return true; } }