major restructuring of generalized variant analysis framework. Now trivially easy to add additional analyses. Easy partitioning of all analyses by features, such as singleton status. Now has transition/transversion bias, counting, dbSNP coverage, HWE violation, selecting of variants by presence/absence in dbs. Also restructured the ROD system to make it easier to add tracks. Also, added the interval track -- if you provide an interval list, then the system automatically makes this available to you as a bound rod -- you can always find out where you are in the interval at every site. Python scripts improved to handle more merging, etc., into population snps.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@918 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-05 23:34:37 +00:00
parent 400399f1b8
commit 819862e04e
28 changed files with 846 additions and 316 deletions

View File

@ -8,12 +8,11 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import org.broadinstitute.sting.gatk.refdata.IntervalRodIterator;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
import java.util.ArrayList;
@ -77,6 +76,8 @@ public class GenomeAnalysisEngine {
bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", argCollection.HAPMAPFile);
if (argCollection.HAPMAPChipFile != null)
bindConvenienceRods("hapmap-chip", "GFF", argCollection.HAPMAPChipFile);
if ( argCollection.intervals != null )
bindConvenienceRods("interval", "Intervals", argCollection.intervals);
// parse out the rod bindings
ReferenceOrderedData.parseBindings(logger, argCollection.RODBindings, rods);
@ -114,7 +115,8 @@ public class GenomeAnalysisEngine {
genericEngineSetup(strictness);
// parse out any genomic location they've provided
List<GenomeLoc> locationsList = setupIntervalRegion();
//List<GenomeLoc> locationsList = setupIntervalRegion();
List<GenomeLoc> locationsList = engine.getLocations();
GenomeLocSortedSet locs = null;
if (locationsList != null)
locs = GenomeLocSortedSet.createSetFromList(locationsList);
@ -202,7 +204,7 @@ public class GenomeAnalysisEngine {
// we default interval files over the genome region string
if (argCollection.intervals != null) {
engine.setLocation(setupIntervalRegion());
engine.setLocation(parseIntervalRegion(argCollection.intervals, false));
}
engine.setReadFilters(new Reads(argCollection));
@ -217,15 +219,15 @@ public class GenomeAnalysisEngine {
*
* @return a list of genomeLoc representing the interval file
*/
private List<GenomeLoc> setupIntervalRegion() {
public static List<GenomeLoc> parseIntervalRegion(final String intervalsString, boolean quiet ) {
List<GenomeLoc> locs = null;
if (argCollection.intervals != null) {
if (new File(argCollection.intervals).exists()) {
logger.info("Intervals argument specifies a file. Loading intervals from file.");
locs = GenomeLoc.IntervalFileToList(argCollection.intervals);
if ( intervalsString != null) {
if (new File(intervalsString).exists()) {
if (! quiet) logger.info("Intervals argument specifies a file. Loading intervals from file.");
locs = GenomeLoc.IntervalFileToList(intervalsString);
} else {
logger.info("Intervals argument does not specify a file. Trying to parse it as a simple string.");
locs = GenomeLoc.parseGenomeLocs(argCollection.intervals);
if (! quiet) logger.info("Intervals argument does not specify a file. Trying to parse it as a simple string.");
locs = GenomeLoc.parseGenomeLocs(intervalsString);
}
}
return locs;

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.providers;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -43,7 +44,7 @@ public class ReferenceOrderedView implements View {
public ReferenceOrderedView( ShardDataProvider provider ) {
this.provider = provider;
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() )
states.add( new ReferenceOrderedDataState( dataSource, (ReferenceOrderedData.RODIterator)dataSource.seek(provider.getShard()) ) );
states.add( new ReferenceOrderedDataState( dataSource, (RODIterator)dataSource.seek(provider.getShard()) ) );
provider.register(this);
}
@ -78,9 +79,9 @@ public class ReferenceOrderedView implements View {
*/
private class ReferenceOrderedDataState {
public final ReferenceOrderedDataSource dataSource;
public final ReferenceOrderedData.RODIterator iterator;
public final RODIterator iterator;
public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, ReferenceOrderedData.RODIterator iterator ) {
public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, RODIterator iterator ) {
this.dataSource = dataSource;
this.iterator = iterator;
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
@ -30,12 +31,12 @@ class IteratorPool {
/**
* All iterators of this reference-ordered data.
*/
private List<ReferenceOrderedData.RODIterator> allIterators = new ArrayList<ReferenceOrderedData.RODIterator>();
private List<RODIterator> allIterators = new ArrayList<RODIterator>();
/**
* All iterators that are not currently in service.
*/
private List<ReferenceOrderedData.RODIterator> availableIterators = new ArrayList<ReferenceOrderedData.RODIterator>();
private List<RODIterator> availableIterators = new ArrayList<RODIterator>();
/**
* Create a new iterator pool given the current ROD.
@ -50,10 +51,10 @@ class IteratorPool {
* @param position Target position for the iterator.
* @return
*/
public ReferenceOrderedData.RODIterator iterator( GenomeLoc position ) {
public RODIterator iterator( GenomeLoc position ) {
// Grab the first iterator in the list whose position is before the requested position.
ReferenceOrderedData.RODIterator selectedIterator = null;
for( ReferenceOrderedData.RODIterator iterator: availableIterators ) {
RODIterator selectedIterator = null;
for( RODIterator iterator: availableIterators ) {
if( (iterator.position() == null && iterator.hasNext()) ||
(iterator.position() != null && iterator.position().isBefore(position)) ) {
selectedIterator = iterator;
@ -79,7 +80,7 @@ class IteratorPool {
* Close the given iterator, returning it to the pool.
* @param iterator Iterator to return to the pool.
*/
public void close( ReferenceOrderedData.RODIterator iterator ) {
public void close( RODIterator iterator ) {
if( !allIterators.contains(iterator) )
throw new StingException("Iterator does not belong to the given pool.");
availableIterators.add(iterator);

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
@ -59,7 +60,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
* @return Iterator through the data.
*/
public Iterator seek( Shard shard ) {
ReferenceOrderedData.RODIterator iterator = iteratorPool.iterator(shard.getGenomeLoc());
RODIterator iterator = iteratorPool.iterator(shard.getGenomeLoc());
return iterator;
}
@ -67,7 +68,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
* Close the specified iterator, returning it to the pool.
* @param iterator Iterator to close.
*/
public void close( ReferenceOrderedData.RODIterator iterator ) {
public void close( RODIterator iterator ) {
this.iteratorPool.close(iterator);
}
}

View File

@ -0,0 +1,41 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.util.Iterator;
import java.io.File;
import java.io.IOException;
/**
 * A reference-ordered datum representing a single entry from an interval list.
 * When an interval list is supplied to the engine it is bound as the "interval"
 * track, so walkers can query which interval the current locus falls in.
 */
public class IntervalRod extends BasicReferenceOrderedDatum {
    // The genomic span of this interval; remains null when the ROD system
    // constructs the datum via the name-only constructor.
    private GenomeLoc loc = null;

    public IntervalRod(String name) {
        super(name);
    }

    public IntervalRod(String name, GenomeLoc loc) {
        super(name);
        this.loc = loc;
    }

    /** @return the genomic location of this interval, or null if none was set. */
    public GenomeLoc getLocation() { return loc; }

    /** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false),
     * since this rod provides its own iterator for reading underlying data files.
     */
    public boolean parseLine(Object header, String[] parts) {
        return false; // this rod has its own iterator
    }

    public String repl() {
        throw new StingException("repl() is not implemented yet");
    }

    /**
     * String form of the interval.  Uses String.valueOf so a datum built with
     * the name-only constructor (loc == null) no longer throws an NPE.
     */
    public String toSimpleString() { return String.valueOf(loc); }

    public String toString() { return toSimpleString(); }

    /** Factory hook invoked reflectively by ReferenceOrderedData to create this track's iterator. */
    public static IntervalRodIterator createIterator(String trackName, File f) throws IOException {
        return IntervalRodIterator.IntervalRodIteratorFromLocsFile(trackName, f);
    }
}

View File

@ -0,0 +1,52 @@
package org.broadinstitute.sting.gatk.refdata;
import java.io.File;
import java.io.IOException;
import java.util.*;
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
/**
 * Iterates over the entries of an interval list, emitting each location as an
 * IntervalRod bound to the supplied track name.
 */
public class IntervalRodIterator implements Iterator<IntervalRod> {
    private String trackName = null;
    private List<GenomeLoc> locations = null;
    private Iterator<GenomeLoc> iter;

    /**
     * Parse an interval file and construct an iterator over its entries.
     * Parsing is delegated to the engine's interval-region parser (quietly,
     * so track setup does not spam the log).
     */
    public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
        final List<GenomeLoc> parsedLocs = GenomeAnalysisEngine.parseIntervalRegion(file.getPath(), true);
        return new IntervalRodIterator(trackName, parsedLocs);
    }

    public IntervalRodIterator(String trackName, List<GenomeLoc> locs) {
        this.trackName = trackName;
        this.locations = locs;
        this.iter = this.locations.iterator();
    }

    @Override
    public boolean hasNext() {
        return iter.hasNext();
    }

    /** Wrap the next location in an IntervalRod carrying our track name. */
    @Override
    public IntervalRod next() {
        return new IntervalRod(trackName, iter.next());
    }

    /** Removal makes no sense for a read-only track. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}

View File

@ -22,7 +22,7 @@ import org.apache.log4j.Logger;
* chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
*
*/
public class PooledEMSNPROD extends TabularROD implements AllelicVariant {
public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes {
public PooledEMSNPROD(final String name) {
super(name);
}
@ -45,4 +45,11 @@ public class PooledEMSNPROD extends TabularROD implements AllelicVariant {
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
public int getPloidy() throws IllegalStateException { return 2; }
public boolean isBiallelic() { return true; }
// SNPCallFromGenotypes interface
public int nIndividuals() { return Integer.parseInt(this.get("EM_N")); }
public int nHomRefGenotypes() { return Integer.parseInt(this.get("n_ref")); }
public int nHetGenotypes() { return Integer.parseInt(this.get("n_het")); }
public int nHomVarGenotypes() { return Integer.parseInt(this.get("n_hom")); }
public List<Genotype> getGenotypes() { return null; }
}

View File

@ -0,0 +1,95 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.Iterator;
/**
 * Generic iterator over a reference-ordered data track.  Wraps the underlying
 * record iterator in a PushbackIterator so seekForward() can look one record
 * ahead and un-read it when it overshoots the requested locus.
 */
public class RODIterator<ROD extends ReferenceOrderedDatum> implements Iterator<ROD> {
    // Underlying record stream; pushback lets seekForward un-read one record.
    private PushbackIterator<ROD> it;
    // Most recently returned / seeked-to record; null before the first read.
    private ROD current = null;
    // Location of the last record returned, or the last seek target.
    private GenomeLoc position = null;

    public RODIterator(Iterator<ROD> it) {
        this.it = new PushbackIterator<ROD>(it);
    }

    public boolean hasNext() { return it.hasNext(); }

    /** Advance to the next record, remembering it and its location. */
    public ROD next() {
        ROD next = it.next();
        if( next != null ) {
            // clone the location so later mutation of the record cannot corrupt our bookmark
            position = next.getLocation().clone();
            current = next;
        }
        return next;
    }

    /**
     * Returns the current position of this iterator.
     * @return Current position of the iterator, or null if no position exists.
     */
    public GenomeLoc position() {
        return position;
    }

    /**
     * Seeks forward in the file until we reach (or cross) a record at contig / pos
     * If we don't find anything and cross beyond contig / pos, we return null
     * Otherwise we return the first object who's start is at pos
     *
     * @param loc target location to seek to
     * @return record at/containing loc, or null if we crossed past it
     */
    public ROD seekForward(final GenomeLoc loc) {
        final boolean DEBUG = false;
        ROD result = null;

        //if (current != null && current.getName().equals("interval")) {
        //    boolean contains = current.getLocation().containsP(loc);
        //    System.out.printf("  %s : current is %s, seeking to %s, contains %b%n", current.getName(), current.getLocation(), loc, contains);
        //}

        // Fast path: the record we already hold spans the requested locus
        // (important for wide records such as intervals queried base-by-base).
        if ( current != null && current.getLocation().containsP(loc) )
            return current;

        if ( DEBUG ) System.out.printf("  *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
        while ( hasNext() ) {
            ROD proposed = next();
            if( proposed == null )
                continue;

            //System.out.printf("  -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
            boolean containedP = proposed.getLocation().containsP(loc);
            //System.out.printf("  %s -> Seeking to %s, at %s => contains = %b%n", current.getName(), loc, current.getLocation(), containedP);
            int cmp = proposed.getLocation().compareTo(loc);
            if ( cmp < 0 ) {
                // current occurs before loc, continue searching
                continue;
            }
            else if ( cmp == 0 || containedP ) {
                // exact hit, or a spanning record that contains loc
                result = proposed;
                break;
            } else {
                // current is after loc: un-read it so the next seek can see it
                it.pushback(proposed);
                break;
            }
        }

        if ( DEBUG ) {
            if ( result != null )
                System.out.printf("  ### Found %s%n", result.getLocation());
        }

        // make a note that the iterator last seeked to the specified position
        current = result;
        position = loc.clone();

        // we ran out of elements or found something
        return result;
    }

    public void remove() {
        throw new UnsupportedOperationException();
    }
}

View File

@ -94,7 +94,7 @@ public class RefMetaDataTracker {
* @param rod
*/
public void bind(final String name, ReferenceOrderedDatum rod) {
logger.debug(String.format("Binding %s to %s%n", name, rod));
//logger.debug(String.format("Binding %s to %s", name, rod));
map.put(canonicalName(name), rod);
}
}

View File

@ -66,6 +66,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
addModule("RefSeq", rodRefSeq.class);
addModule("Table", TabularROD.class);
addModule("PooledEM", PooledEMSNPROD.class);
addModule("Intervals", IntervalRod.class);
}
@ -159,7 +160,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
return this.name.equals(name) && type.isAssignableFrom(this.type);
}
public RODIterator iterator() {
public RODIterator<ROD> iterator() {
Iterator<ROD> it;
try {
Method m = type.getDeclaredMethod("createIterator", String.class,java.io.File.class);
@ -177,7 +178,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
} catch ( java.lang.reflect.InvocationTargetException e ) {
throw new RuntimeException(e);
}
return new RODIterator(it);
return new RODIterator<ROD>(it);
}
// ----------------------------------------------------------------------
@ -301,103 +302,6 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
}
}
// private class SimpleRODIterator implements Iterator<ROD> {
// //private WhitespaceTextFileParser parser = null;
// private TabbedTextFileParser parser = null;
//
// public SimpleRODIterator() {
// parser = new TabbedTextFileParser(true, file);
// }
//
// public boolean hasNext() {
// return parser.hasNext();
// }
//
// public ROD next() {
// String parts[] = parser.next();
// return parseLine(parts);
// }
//
// public void remove() {
// throw new UnsupportedOperationException();
// }
// }
public class RODIterator implements Iterator<ROD> {
private PushbackIterator<ROD> it;
private GenomeLoc position = null;
public RODIterator(Iterator<ROD> it) {
this.it = new PushbackIterator<ROD>(it);
}
public boolean hasNext() { return it.hasNext(); }
public ROD next() {
ROD next = it.next();
if( next != null )
position = next.getLocation().clone();
return next;
}
/**
* Returns the current position of this iterator.
* @return Current position of the iterator, or null if no position exists.
*/
public GenomeLoc position() {
return position;
}
/**
* Seeks forward in the file until we reach (or cross) a record at contig / pos
* If we don't find anything and cross beyond contig / pos, we return null
* Otherwise we return the first object who's start is at pos
*
* @param loc
* @return
*/
public ROD seekForward(final GenomeLoc loc) {
final boolean DEBUG = false;
ROD result = null;
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
while ( hasNext() ) {
ROD current = next();
if( current == null )
continue;
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
int cmp = current.getLocation().compareTo(loc);
if ( cmp < 0 ) {
// current occurs before loc, continue searching
continue;
}
else if ( cmp == 0 ) {
result = current;
break;
} else {
// current is after loc
it.pushback(current);
break;
}
}
if ( DEBUG ) {
if ( result != null )
System.out.printf(" ### Found %s%n", result.getLocation());
}
// make a note that the iterator last seeked to the specified position
position = loc.clone();
// we ran out of elements or found something
return result;
}
public void remove() {
throw new UnsupportedOperationException();
}
}
// ----------------------------------------------------------------------
//
// Parsing

View File

@ -0,0 +1,31 @@
package org.broadinstitute.sting.gatk.refdata;
import java.util.*;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.apache.log4j.Logger;
/**
 * A SNP call that exposes per-genotype counts (hom-ref / het / hom-var), as
 * produced e.g. by pooled EM calling, whose tabular form looks like:
 *
 * loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom
 * chr1:1104840 A N 0.000000 -85.341265 -85.341265 0.000000 0.000000 324.000000 162 0 0
 * chr1:1104841 C N 0.000000 -69.937928 -69.937928 0.000000 0.000000 324.000000 162 0 0
 * chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
 */
public interface SNPCallFromGenotypes extends AllelicVariant {
    // total number of individuals genotyped at this site
    public int nIndividuals();
    // count of homozygous-reference genotypes
    public int nHomRefGenotypes();
    // count of heterozygous genotypes
    public int nHetGenotypes();
    // count of homozygous-variant genotypes
    public int nHomVarGenotypes();
    // per-individual genotypes; implementations may return null when unavailable
    public List<Genotype> getGenotypes();
}

View File

@ -11,9 +11,7 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
@ -22,7 +20,6 @@ import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
public abstract class TraversalEngine {
@ -30,7 +27,7 @@ public abstract class TraversalEngine {
protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = null;
// Iterator over rods
List<Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>> rodIters;
List<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>> rodIters;
// How strict should we be with SAM/BAM parsing?
protected ValidationStringency strictness = ValidationStringency.STRICT;
@ -227,6 +224,10 @@ public abstract class TraversalEngine {
return this.locs != null && !this.locs.isEmpty();
}
public List<GenomeLoc> getLocations() {
return this.locs;
}
// --------------------------------------------------------------------------------------------------------------
//
// printing
@ -446,9 +447,9 @@ public abstract class TraversalEngine {
*/
protected void initializeRODs() {
// set up reference ordered data
rodIters = new ArrayList<Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>>();
rodIters = new ArrayList<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>>();
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
rodIters.add(new Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>(data.getName(), data.iterator()));
rodIters.add(new Pair<String, RODIterator<? extends ReferenceOrderedDatum>>(data.getName(), data.iterator()));
}
}
@ -492,10 +493,11 @@ public abstract class TraversalEngine {
*/
protected RefMetaDataTracker getReferenceOrderedDataAtLocus(final GenomeLoc loc) {
RefMetaDataTracker tracks = new RefMetaDataTracker();
for (Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> pair : rodIters) {
for (Pair<String, RODIterator<? extends ReferenceOrderedDatum>> pair : rodIters) {
String name = pair.getFirst();
tracks.bind(name, pair.getSecond().seekForward(loc));
}
return tracks;
}

View File

@ -0,0 +1,42 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
/**
 * Convenience base class for variant analyses.  Stores the analysis name plus
 * the master walker and output stream supplied at initialization, and provides
 * empty defaults for getParams() and done(); subclasses only implement update().
 */
public abstract class BasicVariantAnalysis implements VariantAnalysis {
    protected String name;
    protected PrintStream out;
    protected VariantEvalWalker master;

    public BasicVariantAnalysis(String name) {
        this.name = name;
    }

    /** Record the owning walker and the stream this analysis reports to. */
    public void initialize(VariantEvalWalker master, PrintStream out) {
        this.out = out;
        this.master = master;
    }

    /** @return the short name identifying this analysis. */
    public String getName() {
        return name;
    }

    public PrintStream getPrintStream() {
        return out;
    }

    /** @return descriptive parameter strings; none by default. */
    public List<String> getParams() {
        return new ArrayList<String>();
    }

    /** @return end-of-traversal summary lines; none by default. */
    public List<String> done() {
        return new ArrayList<String>();
    }

    /** Process one site; returns an interesting-event string, or null. */
    public abstract String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
}

View File

@ -0,0 +1,85 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.SNPCallFromGenotypes;
import org.broadinstitute.sting.gatk.refdata.PooledEMSNPROD;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import cern.jet.math.Arithmetic;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:00 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Tests each SNP call carrying genotype counts for violation of Hardy-Weinberg
 * equilibrium via Emigh's (1980) exact test and tallies the violations.
 */
public class HardyWeinbergEquilibrium extends BasicVariantAnalysis {
    // p-value below which a site counts as an HWE violation
    private double threshold;
    int nSites = 0;       // number of genotype-bearing SNP calls examined
    int nViolations = 0;  // number of sites with p < threshold

    HardyWeinbergEquilibrium(double threshold) {
        super("hwe");
        this.threshold = threshold;
    }

    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
        String r = null;
        if ( eval != null &&
             eval instanceof SNPCallFromGenotypes ) {
            nSites++;
            SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
            int nAA = call.nHomRefGenotypes();
            int nAa = call.nHetGenotypes();
            int naa = call.nHomVarGenotypes();
            int nA = 2 * nAA + nAa;   // number of A alleles
            int n = nAA + nAa + naa;  // number of individuals

            //
            // from Emigh 1980
            //
            // P = pr[nAa | nA] = multinomial[n over nAA, nAa, naa] / binomial[2n over nA] * 2^nAa
            //
            // where nAA, nAa, naa are the observed numbers of the three genotypes, AA, Aa, and aa,
            // respectively, and nA is the number of A alleles, where nA = 2nAA + nAa, and n is the number of alleles
            //
            int[] mXs = { nAA, nAa, naa };  // counts of each genotype as vector
            double m = MathUtils.multinomial(mXs);
            double b = Arithmetic.binomial(2 * n, nA);
            double tosses = Math.pow(2, nAa);
            double p = (m / b) * tosses;

            if ( p < threshold ) {
                r = String.format("HWE-violation %f < %f at %s", p, threshold, eval);
                nViolations++;
            }
        }
        return r;
    }

    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("n_calls %d", nSites));
        s.add(String.format("n_violations %d", nViolations));
        // Rate = percentage of examined sites that violate HWE.  The original
        // computed (100.0 * nSites) / nViolations -- inverted, and a division
        // by zero when no violations were seen.
        double rate = nSites == 0 ? 0.0 : (100.0 * nViolations) / nSites;
        s.add(String.format("violations_rate %.2f", rate));
        return s;
    }
}

View File

@ -0,0 +1,86 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.io.PrintStream;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:19 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Measures the distance between successive variants on the same contig and
 * bins those distances into fixed boundaries.  When an interval track is
 * bound, pairs that span two different intervals are excluded.
 */
public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
    ArrayList<Long> pairWiseDistances;                              // observed pair distances
    int[] pairWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000}; // bin upper bounds, in bp
    AllelicVariant lastVariant = null;                              // previous variant seen
    GenomeLoc lastVariantInterval = null;                           // interval the previous variant fell in

    public PairwiseDistanceAnalysis() {
        super("pairwise_distances");
        pairWiseDistances = new ArrayList<Long>();
    }

    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
        String r = null;
        if ( eval != null ) {
            // which interval (if any) does the current site fall in?
            IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
            GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();

            if (lastVariant != null) {
                GenomeLoc eL = eval.getLocation();
                GenomeLoc lvL = lastVariant.getLocation();
                if (eL.getContigIndex() == lvL.getContigIndex()) {
                    long d = eL.distance(lvL);
                    // Null-safe interval comparison: the original called
                    // lastVariantInterval.compareTo(interval) directly, which
                    // NPEs when the current site has no bound interval.  One
                    // site having an interval and the other not counts as
                    // "different intervals".
                    boolean differentIntervals = lastVariantInterval != null &&
                            (interval == null || lastVariantInterval.compareTo(interval) != 0);
                    if ( differentIntervals ) {
                        // we're on different intervals -- exclude this pair
                        //out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
                    } else {
                        pairWiseDistances.add(d);
                        r = String.format("Pairwise-distance %d %s %s%n", d, eL, lvL);
                    }
                }
            }

            lastVariant = eval;
            lastVariantInterval = interval;
        }
        return r;
    }

    public List<String> done() {
        int[] pairCounts = new int[pairWiseBoundries.length];
        Arrays.fill(pairCounts, 0);

        // assign each distance to the first (smallest) boundary that holds it
        for ( long dist : pairWiseDistances ) {
            boolean done = false;
            for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
                int maxDist = pairWiseBoundries[i];
                if ( dist <= maxDist ) {
                    pairCounts[i]++;
                    done = true;
                }
            }
        }

        List<String> s = new ArrayList<String>();
        s.add(String.format("snps counted for pairwise distance: %d", pairWiseDistances.size()));
        for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
            s.add(String.format("snps within %8d bp: %d", pairWiseBoundries[i], pairCounts[i]));
        }
        return s;
    }
}

View File

@ -0,0 +1,74 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.BaseUtils;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:00 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Counts transition and transversion substitutions among SNP calls, binned by
 * site heterozygosity, and reports the overall transition/transversion ratio.
 */
public class TransitionTranversionAnalysis extends BasicVariantAnalysis {
    int N_TRANSITION_TRANVERSION_BINS = 100;
    Histogram<Integer> transitions;   // transition counts binned by heterozygosity
    Histogram<Integer> transversions; // transversion counts binned by heterozygosity

    public TransitionTranversionAnalysis() {
        super("transitions_transversions");
        transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
        transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
    }

    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
        if ( eval != null && eval.isSNP() ) {
            char refBase = eval.getRefSnpFWD();
            char altBase = eval.getAltSnpFWD();
            // classify ref->alt as transition or transversion, then bump the
            // count in the bin for this site's heterozygosity
            BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
            Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
            double het = eval.getHeterozygosity();
            h.setBin(het, h.getBin(het) + 1);
        }
        return null;
    }

    public List<String> done() {
        int nTransitions = 0;
        int nTransversions = 0;
        List<String> s = new ArrayList<String>();
        for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
            double avHet = transitions.bin2x(i);
            if ( avHet > 0.5 ) break;  // heterozygosity above 0.5 is unused for biallelic sites
            // (the original also computed an unused per-bin float ratio here; removed)
            nTransitions += transitions.getBin(i);
            nTransversions += transversions.getBin(i);
        }
        s.add(String.format("transitions %d", nTransitions));
        s.add(String.format("transversions %d", nTransversions));
        // guard against division by zero (the original printed "Infinity"
        // when no transversions were observed)
        double ratio = nTransversions == 0 ? 0.0 : nTransitions / (1.0 * nTransversions);
        s.add(String.format("ratio %.2f", ratio));
        return s;
    }
}

View File

@ -0,0 +1,17 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
/**
 * A single pluggable analysis run by VariantEvalWalker over every locus.
 * Implementations are initialized once, have update() called per site, and
 * report their summary lines via done() at the end of the traversal.
 */
public interface VariantAnalysis {
// Short name of this analysis; used when building per-analysis output filenames.
public String getName();
// Stream this analysis' per-site and summary output is written to.
public PrintStream getPrintStream();
// Parameters of this analysis, appended to the output filename.
public List<String> getParams();
// Called once before traversal with the owning walker and the output stream to use.
public void initialize(VariantEvalWalker master, PrintStream out);
// Called at each locus; a non-null return value is printed to this analysis' stream.
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
// Called after traversal; returns the summary lines for the report.
public List<String> done();
}

View File

@ -0,0 +1,33 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
/**
 * Trivial analysis that counts the number of loci visited and the number of
 * variant calls seen, reporting overall variant rates at the end.
 */
public class VariantCounter extends BasicVariantAnalysis {
    // Number of loci passed to update(), i.e. covered bases examined.
    int nBasesCovered = 0;
    // Number of loci where a non-null variant call was provided.
    int nSNPs = 0;

    public VariantCounter() {
        super("variant_counts");
    }

    /**
     * Counts this locus and, if eval is non-null, the variant call.
     * Always returns null: nothing per-site to print.
     */
    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
        nBasesCovered++;
        nSNPs += eval == null ? 0 : 1;
        return null;
    }

    /**
     * @return summary lines: covered bases, variant sites, and variant rates.
     */
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("n bases covered: %d", nBasesCovered));
        s.add(String.format("sites: %d", nSNPs));
        s.add(String.format("variant rate: %.5f confident variants per base", nSNPs / (1.0 * nBasesCovered)));
        // Guard the integer division: with zero variants the original threw
        // an ArithmeticException here.
        if (nSNPs > 0)
            s.add(String.format("variant rate: 1 / %d confident variants per base", nBasesCovered / nSNPs));
        else
            s.add("variant rate: no confident variants seen");
        return s;
    }
}

View File

@ -1,12 +1,22 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
public class VariantDBCoverage {
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
public class VariantDBCoverage extends BasicVariantAnalysis {
private String dbName;
private int nDBObs = 0;
private int nEvalObs = 0;
private int nOverlapping = 0;
public VariantDBCoverage(final String name) {
super("db_coverage");
dbName = name;
}
@ -37,6 +47,13 @@ public class VariantDBCoverage {
return nOverlappingSites() / (1.0 * nEvalSites());
}
/**
 * Tallies DB-vs-eval overlap at this locus and flags novel eval calls.
 * Returns a "Novel ..." line when eval has a call the DB lacks; otherwise null.
 */
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
// There are four cases here (db/eval each present or absent); inc() presumably tallies them -- see its definition.
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null);
inc(dbsnp != null, eval != null);
return dbsnp == null && eval != null ? "Novel " + eval : null;
}
/**
* What fraction of the DB sites were discovered in the evalution calls?
*
@ -46,18 +63,15 @@ public class VariantDBCoverage {
return nOverlappingSites() / (1.0 * nDBSites());
}
public String toSingleLineString(final String prefix) {
return String.format("%s %d\t%d\t%d\t%.2f\t%.2f", prefix, nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval());
}
public String toMultiLineString(final String prefix) {
StringBuilder s = new StringBuilder();
s.append(String.format("%s name %s%n", prefix, dbName));
s.append(String.format("%s n_db_sites %d%n", prefix, nDBSites()));
s.append(String.format("%s n_eval_sites %d%n", prefix, nEvalSites()));
s.append(String.format("%s n_overlapping_sites %d%n", prefix, nOverlappingSites()));
s.append(String.format("%s per_eval_sites_in_db %.2f%n", prefix, 100*fractionEvalSitesCoveredByDB()));
s.append(String.format("%s per_db_sites_in_eval %.2f%n", prefix, 100*fractionDBSitesDiscoveredInEval()));
return s.toString();
public List<String> done() {
List<String> s = new ArrayList<String>();
s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()));
s.add(String.format("name %s", dbName));
s.add(String.format("n_db_sites %d", nDBSites()));
s.add(String.format("n_eval_sites %d", nEvalSites()));
s.add(String.format("n_overlapping_sites %d", nOverlappingSites()));
s.add(String.format("per_eval_sites_in_db %.2f", 100*fractionEvalSitesCoveredByDB()));
s.add(String.format("per_db_sites_in_eval %.2f", 100*fractionDBSitesDiscoveredInEval()));
return s;
}
}

View File

@ -1,24 +1,16 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.EnumMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.*;
import java.io.*;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
@By(DataSource.REFERENCE)
@Requires(DataSource.REFERENCE)
@Allows(DataSource.REFERENCE)
@ -29,110 +21,114 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
@Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false)
public boolean printVariants = false;
int nBasesCovered = 0;
int nSNPs = 0;
VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP");
@Argument(shortName="badHWEThreshold", doc="XXX", required=false)
public double badHWEThreshold = 0.001;
int N_TRANSITION_TRANVERSION_BINS = 100;
Histogram<Integer> transitions;
Histogram<Integer> transversions;
String analysisFilenameBase = null;
@Argument(shortName="outputFilenameBase", doc="", required=false)
String outputFilenameBase = null;
String COMMENT_STRING = "";
ArrayList<Long> pairWiseDistances;
int[] pairWiseBoundries = {1, 10, 100, 1000, 10000, 1000000000};
AllelicVariant lastVariant = null;
HashMap<String, ArrayList<VariantAnalysis>> analysisSets;
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> transition_transversion_counts[];
final String ALL_SNPS = "all";
final String SINGLETON_SNPS = "singletons";
final String TWOHIT_SNPS = "2plus_hit";
final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS };
// Sets up the histograms, the analysis output path, and one analysis set per
// partition name (all / singletons / 2plus_hit).
public void initialize() {
//transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
//transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
// Heterozygosity-binned histograms over [0, 1].
transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
pairWiseDistances = new ArrayList<Long>();
// setup the path to the analysis output files; when no -o was given,
// analysisFilenameBase stays null and output goes to the shared stream.
if ( this.getToolkit().getArguments().outFileName != null ) {
analysisFilenameBase = this.getToolkit().getArguments().outFileName + ".analysis.";
}
// Build an independent set of analyses for each named partition.
analysisSets = new HashMap<String, ArrayList<VariantAnalysis>>();
for ( String setName : ALL_ANALYSIS_NAMES ) {
analysisSets.put(setName, initializeAnalysisSet(setName));
}
}
//private int transition_transversion_bin(double het) {
// return (int)Math.floor(het * N_TRANSITION_TRANVERSION_BINS);
//}
// Looks up the analysis set registered under the given partition name
// (returns null for an unknown name).
private ArrayList<VariantAnalysis> getAnalysisSet(final String name) {
return analysisSets.get(name);
}
//private double transition_transversion_bin2het(int bin) {
// return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS;
//}
// Creates and initializes the full list of analyses for one partition
// (setName); each analysis gets its own output stream.  To register a new
// analysis, add it to the list below.
private ArrayList<VariantAnalysis> initializeAnalysisSet(final String setName) {
ArrayList<VariantAnalysis> analyses = new ArrayList<VariantAnalysis>();
//
// Add new analyzes here!
//
analyses.add(new VariantCounter());
analyses.add(new VariantDBCoverage("dbSNP"));
analyses.add(new TransitionTranversionAnalysis());
analyses.add(new PairwiseDistanceAnalysis());
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
// VariantMatcher prints every site, so it is only enabled on request.
if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));
// Bind each analysis to this walker and to its output stream.
for ( VariantAnalysis analysis : analyses ) {
analysis.initialize(this, openAnalysisOutputStream(setName, analysis));
}
return analyses;
}
/**
 * Builds the output filename for an analysis: the configured base followed
 * by the analysis name and its parameters, joined with '.'.
 *
 * @param name   analysis name (typically setName + "." + analysis name)
 * @param params analysis parameters appended to the filename
 * @return the filename, or null when no output base was configured
 */
public String getAnalysisFilename(final String name, final List<String> params) {
    return analysisFilenameBase == null
            ? null
            : analysisFilenameBase + Utils.join(".", Utils.cons(name, params));
}
/**
 * Opens the output stream for one analysis within a set.  Falls back to the
 * walker's shared `out` stream when no per-analysis filename is configured.
 *
 * @param setName  name of the analysis set the analysis belongs to
 * @param analysis the analysis whose stream is being opened
 * @return a stream the analysis should write to (never null)
 */
public PrintStream openAnalysisOutputStream(final String setName, VariantAnalysis analysis) {
final String filename = getAnalysisFilename(setName + "." + analysis.getName(), analysis.getParams());
if ( filename == null )
return out;
else {
File file = new File(filename);
try {
return new PrintStream(new FileOutputStream(file));
} catch (FileNotFoundException e) {
// Wrap in the project's runtime exception so traversal aborts with context.
throw new StingException("Couldn't open analysis output file " + filename, e);
}
}
}
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
nBasesCovered++;
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null);
// Iterate over each analysis, and update it
AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
if ( eval != null && eval.getVariationConfidence() < minDiscoveryQ )
eval = null;
if ( printVariants && ( eval != null || dbsnp != null ) ) {
String matchFlag = " ";
if ( eval != null && dbsnp != null ) matchFlag = "*** ";
if ( eval == null && dbsnp != null ) matchFlag = ">>> ";
if ( eval != null && dbsnp == null ) matchFlag = "<<< ";
updateAnalysisSet(ALL_SNPS, eval, tracker, ref, context);
System.out.printf("%sDBSNP: %s%n%sEVAL:%s%n",
matchFlag, dbsnp,
matchFlag, eval);
if ( eval instanceof SNPCallFromGenotypes ) {
SNPCallFromGenotypes call = (PooledEMSNPROD)eval;
int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
//System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls);
final String s = nVarGenotypes == 1 ? SINGLETON_SNPS : TWOHIT_SNPS;
updateAnalysisSet(s, eval, tracker, ref, context);
}
if ( eval != null ) {
//System.out.printf("GREP ME%n");
if ( ! eval.isSNP() || eval.getVariationConfidence() < minDiscoveryQ )
System.out.printf("We have a problem at %s: %b %f%n", eval, eval.isSNP(), eval.getVariationConfidence());
}
if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) {
//System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
//System.out.printf("N snps %d = %s%n", ++nSNPs, eval.getLocation());
updateTransitionTransversion(eval, ref, context);
if (lastVariant != null) updatePairwiseDistances(eval, lastVariant);
lastVariant = eval;
//updateHapMapRate(dbsnp, eval, ref, context);
}
updateVariantDBCoverage(dbsnp, eval, ref, context);
return 1;
}
private void updatePairwiseDistances(AllelicVariant eval, AllelicVariant lastVariant) {
GenomeLoc eL = eval.getLocation();
GenomeLoc lvL = lastVariant.getLocation();
if (eL.getContigIndex() == lvL.getContigIndex()) {
long d = eL.distance(lvL);
pairWiseDistances.add(d);
out.printf("# Pairwise-distance %d %s %s%n", d, eL, lvL);
/**
 * Feeds the current site to every analysis in the named set; any non-null
 * string an analysis returns is echoed to that analysis' own output stream.
 */
public void updateAnalysisSet(final String analysisSetName, AllelicVariant eval,
                              RefMetaDataTracker tracker, char ref, LocusContext context) {
    for (VariantAnalysis member : getAnalysisSet(analysisSetName)) {
        final String interesting = member.update(eval, tracker, ref, context);
        if (interesting != null) {
            member.getPrintStream().println(interesting);
        }
    }
}
private void updateTransitionTransversion(AllelicVariant dbsnp, char ref, LocusContext context) {
char refBase = dbsnp.getRefSnpFWD();
char altBase = dbsnp.getAltSnpFWD();
//System.out.printf("%c %c%n", refBase, altBase);
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
double het = dbsnp.getHeterozygosity();
h.setBin(het, h.getBin(het) + 1);
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
}
private void updateVariantDBCoverage(AllelicVariant dbsnp, AllelicVariant eval, char ref, LocusContext context) {
// There are four cases here:
dbSNPStats.inc(dbsnp != null, eval != null);
}
// Given result of map function
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
@ -143,54 +139,24 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
}
public void onTraversalDone(Integer result) {
int nTransitions = 0;
int nTransversions = 0;
StringBuilder s = new StringBuilder();
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
//double avHet = Math.pow(10, transitions.bin2x(i));
double avHet = transitions.bin2x(i);
if ( avHet > 0.5 ) break;
int sit = transitions.getBin(i);
int ver = transversions.getBin(i);
nTransitions += sit;
nTransversions += ver;
double ratio = (float)sit/ver;
s.append(String.format("%.2f %d %d %f%n", avHet, sit, ver, ratio));
for ( String analysisSetName : ALL_ANALYSIS_NAMES ) {
printAnalysisSet(analysisSetName);
}
out.printf("# n bases covered: %d%n", nBasesCovered);
out.printf("# sites: %d%n", nSNPs);
out.printf("# variant rate: %.5f confident variants per base%n", nSNPs / (1.0 * nBasesCovered));
out.printf("# variant rate: 1 / %d confident variants per base%n", nBasesCovered / nSNPs);
out.printf("# transitions: %d%n", nTransitions);
out.printf("# transversions: %d%n", nTransversions);
out.printf("# ratio: %.2f%n", nTransitions / (1.0 * nTransversions));
}
// dbSNP stats
out.println(dbSNPStats.toSingleLineString("#"));
out.print(dbSNPStats.toMultiLineString("#"));
int[] pairCounts = new int[pairWiseBoundries.length];
Arrays.fill(pairCounts, 0);
for ( long dist : pairWiseDistances ) {
boolean done = false;
for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
int maxDist = pairWiseBoundries[i];
if ( dist < maxDist ) {
pairCounts[i]++;
done = true;
}
private void printAnalysisSet( final String analysisSetName ) {
Date now = new Date();
for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
PrintStream stream = analysis.getPrintStream(); // getAnalysisOutputStream(analysisSetName + "." + analysis.getName(), analysis.getParams());
stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78));
stream.printf("%sAnalysis set %s%n", COMMENT_STRING, analysisSetName);
stream.printf("%sAnalysis name %s%n", COMMENT_STRING, analysis.getName());
stream.printf("%sAnalysis params %s%n", COMMENT_STRING, Utils.join(" ", analysis.getParams()));
stream.printf("%sAnalysis class %s%n", COMMENT_STRING, analysis );
stream.printf("%sAnalysis time %s%n", COMMENT_STRING, now );
for ( String line : analysis.done()) {
stream.printf("%s %s%n", COMMENT_STRING, line);
}
}
out.printf("# snps counted for pairwise distance: %d%n", pairWiseDistances.size());
for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
int maxDist = pairWiseBoundries[i];
int count = pairCounts[i];
out.printf("# snps within %d bp: %d%n", maxDist, count);
}
out.print(s);
}
}

View File

@ -0,0 +1,34 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
/**
 * Analysis that prints one line per site where either the eval call or the
 * tracked DB has a variant, flagging agreement between the two.
 */
public class VariantMatcher extends BasicVariantAnalysis {
    // Name of the reference DB track to compare eval calls against.
    String dbName;

    public VariantMatcher(final String name) {
        super("variant_matches");
        dbName = name;
    }

    /**
     * Flags: "***" both agree, ">>>" DB only, "<<<" eval only.
     * Returns null at sites with no variant on either side.
     */
    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
        final AllelicVariant db = (AllelicVariant) tracker.lookup(dbName, null);
        if (eval == null && db == null) {
            return null;
        }
        final String matchFlag;
        if (db != null && eval != null) {
            matchFlag = "*** ";
        } else if (db != null) {
            matchFlag = ">>> ";
        } else {
            matchFlag = "<<< ";
        }
        return String.format("%s %s: %s <=> EVAL: %s", dbName, matchFlag, db, eval);
    }
}

View File

@ -84,6 +84,35 @@ public class MathUtils {
//return (new cern.jet.random.Binomial(n, p, cern.jet.random.engine.RandomEngine.makeDefault())).pdf(k);
}
/**
* Computes a multinomial. This is computed using the formula
*
* M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ]
*
* where xi represents the number of times outcome i was observed, n is the number of total observations.
* In this implementation, the value of n is inferred as the sum over i of xi.
*
* @param x an int[] of counts, where each element represents the number of times a certain outcome was observed
* @return the multinomial of the specified configuration.
*/
public static double multinomial(int[] x) {
    // In order to avoid overflow in computing large factorials in the multinomial
    // coefficient, we split the calculation up into the product of a bunch of
    // binomial coefficients:  M(x; n) = prod_i C(x[0]+...+x[i], x[i]).
    double multinomialCoefficient = 1.0;
    // Maintain the running prefix sum incrementally instead of re-summing
    // x[0..i] on every iteration (was O(k^2), now O(k); same values).
    int n = 0;
    for (int i = 0; i < x.length; i++) {
        n += x[i];
        multinomialCoefficient *= Arithmetic.binomial(n, x[i]);
    }
    return multinomialCoefficient;
}
/**
* Computes a multinomial probability. This is computed using the formula
*
@ -101,16 +130,7 @@ public class MathUtils {
// In order to avoid overflow in computing large factorials in the multinomial
// coefficient, we split the calculation up into the product of a bunch of
// binomial coefficients.
double multinomialCoefficient = 1.0;
for (int i = 0; i < x.length; i++) {
int n = 0;
for (int j = 0; j <= i; j++) { n += x[j]; }
double multinomialTerm = Arithmetic.binomial(n, x[i]);
multinomialCoefficient *= multinomialTerm;
}
double multinomialCoefficient = multinomial(x);
double probs = 1.0, totalprob = 0.0;
for (int obsCountsIndex = 0; obsCountsIndex < x.length; obsCountsIndex++) {

View File

@ -43,6 +43,13 @@ public class Utils {
throw new RuntimeException(msg);
}
/**
 * LISP-style cons: returns a fresh mutable list containing elt followed by
 * the elements of l.  Neither argument is modified; a null l is treated as
 * the empty list.
 *
 * @param elt the element to prepend
 * @param l   the (possibly null) tail list
 * @return a new list [elt, l[0], l[1], ...]
 */
public static <T> List<T> cons(final T elt, final List<T> l) {
    // Presize to the exact final size to avoid growth copies for large tails.
    List<T> l2 = new ArrayList<T>(1 + (l == null ? 0 : l.size()));
    l2.add(elt);
    if ( l != null ) l2.addAll(l);
    return l2;
}
/**
* pretty print the warning message supplied
* @param message the message

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.TabularROD;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import java.io.File;
import java.io.FileNotFoundException;
@ -56,7 +57,7 @@ public class IteratorPoolTest extends BaseTest {
@Test
public void testCreateSingleIterator() {
IteratorPool iteratorPool = new IteratorPool(rod);
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 );
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
@ -77,10 +78,10 @@ public class IteratorPoolTest extends BaseTest {
@Test
public void testCreateMultipleIterators() {
IteratorPool iteratorPool = new IteratorPool(rod);
ReferenceOrderedData.RODIterator iterator1 = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
RODIterator iterator1 = (RODIterator)iteratorPool.iterator( testSite1 );
// Create a new iterator at position 2.
ReferenceOrderedData.RODIterator iterator2 = iteratorPool.iterator( testSite2 );
RODIterator iterator2 = iteratorPool.iterator( testSite2 );
Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
@ -127,7 +128,7 @@ public class IteratorPoolTest extends BaseTest {
@Test
public void testIteratorConservation() {
IteratorPool iteratorPool = new IteratorPool(rod);
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 );
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
@ -162,7 +163,7 @@ public class IteratorPoolTest extends BaseTest {
@Test
public void testIteratorCreation() {
IteratorPool iteratorPool = new IteratorPool(rod);
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite3 );
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite3 );
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());

View File

@ -26,7 +26,7 @@ import java.util.ArrayList;
public class TabularRODTest extends BaseTest {
private static FastaSequenceFile2 seq;
private ReferenceOrderedData ROD;
private ReferenceOrderedData.RODIterator iter;
private RODIterator iter;
@BeforeClass
@ -113,7 +113,7 @@ public class TabularRODTest extends BaseTest {
public void testDelim1() {
File file2 = new File(testDir + "TabularDataTest2.dat");
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
RODIterator iter_commas = ROD_commas.iterator();
logger.warn("Executing testDelim1");
TabularROD one2 = (TabularROD)iter_commas.next();
@ -130,7 +130,7 @@ public class TabularRODTest extends BaseTest {
TabularROD.setDelimiter(",",",");
File file2 = new File(testDir + "TabularDataTest2.dat");
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
RODIterator iter_commas = ROD_commas.iterator();
logger.warn("Executing testDelim1");
TabularROD one2 = (TabularROD)iter_commas.next();
@ -173,7 +173,7 @@ public class TabularRODTest extends BaseTest {
out.println(row.toString());
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class);
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
RODIterator iter_commas = ROD_commas.iterator();
TabularROD one = (TabularROD)iter_commas.next();
assertTrue(one.size() == 3);

View File

@ -63,6 +63,7 @@ def main():
if filter(lambda x: x <> 0, jobids) <> []:
# there's still work to do
sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
# TODO: Should add a wait here for all farm jobs to finish...
print 'gelis', gelis
print 'jobids', jobids
else:
@ -76,8 +77,11 @@ def main():
dbsnpFile = geli2dbsnpFile(geli)
if not os.path.exists(dbsnpFile):
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
if jobid == 0: jobid = None
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
# TODO: Should add a wait here for all farm jobs to finish...
# read in the dbSNP tracks
nTotalSnps = 0
nNovelSnps = 0
@ -99,11 +103,12 @@ def main():
jobid = None
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
for geli, variantOut in zip(gelis, variantsOut):
name = os.path.split(geli)[1]
if not os.path.exists(variantOut):
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )
cmd = ("GeliToText.jar I=%s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | awk '$7 > %f {print \"%s\" $0}' > %s" % ( geli, OPTIONS.lod, name, variantOut) )
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
cmd = ("cat %s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
cmd = ("cat %s | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
sortedCallFile = 'all.sorted.calls'
@ -113,14 +118,14 @@ def main():
sortedCalls = [line.split() for line in open(sortedCallFile)]
aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom individuals'
for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
if snp == None: continue # ignore bad calls
#print snp
#sharedCalls = list(sharedCallsGroup)
#genotype = list(sharedCalls[0][5])
print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d NA' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
outputFile.close()

View File

@ -24,7 +24,7 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
if waitID <> None:
cmd_str += " -w \"done(%s)\"" % (str(waitID))
cmd_str += " "+cmd_str_from_user
cmd_str += " \""+cmd_str_from_user + "\""
print ">>> Farming via "+cmd_str

View File

@ -55,13 +55,22 @@ def genotypes2allelefrequencies(ref, genotypes, nIndividuals = -1):
nAltChroms = nChroms - nRefChroms
p = float(nRefChroms) / nChroms
q = float(nAltChroms) / nChroms
#print 'genotypes', genotypes
#print 'alleles', alleles
#print 'nChroms', nChroms
#print 'nCalledChroms', nCalledChroms
#print 'nMissingChroms', nMissingChroms
#print 'nRefChroms', nRefChroms
#print 'nAltChroms', nAltChroms
#print 'p, q', p, q
assert p + q == 1
return [p, q, n]
class PicardSNP:
def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes):
def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes, sources):
self.loc = loc
self.ref = ref
self.polymorphism = polymorphism
@ -69,6 +78,7 @@ class PicardSNP:
self.nIndividuals = allelefrequencies[2]
self.allelefrequencies = allelefrequencies
self.genotypes = genotypes
self.sources = sources
def refGenotype(self):
return self.ref + self.ref
@ -92,7 +102,7 @@ class PicardSNP:
return self.allelefrequencies[1]
def countGenotype(self, genotype):
r = len(filter( lambda x: x == genotype, self.genotypes ))
r = len(filter( lambda x: sorted(x) == sorted(genotype), self.genotypes ))
#print 'countGenotype', genotype, self.genotypes, r
return r
@ -105,7 +115,6 @@ class PicardSNP:
def nHomVarGenotypes(self):
return self.countGenotype(self.homVarGenotype())
def __str__(self):
return '%s %s %s %s %s %s' % ( self.loc, self.ref, str(self.polymorphism), str(self.het()), str(self.allelefrequencies), str(self.genotypes))
@ -132,7 +141,7 @@ def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ):
#print 'polymorphism', polymorphism
genotype = list(geliCallsAtSite[1][0][5])
return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes)
return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes, [])
#return '%s %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite))