major restructuring of generalized variant analysis framework. Now trivally easy to add additional analyses. Easy partitioning of all analyses by features, such as singleton status. Now has transition/transversional bias, counting, dbSNP coverage, HWE violation, selecting of variants by presence/absense in dbs. Also restructured the ROD system to make it easier to add tracks. Also, added the interval track -- if you provide an interval list, then the system autoatmically makese this available to you as a bound rod -- you can always find out where you are in the interval at every site. Python scripts improved to handle more merging, etc, into population snps.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@918 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
400399f1b8
commit
819862e04e
|
|
@ -8,12 +8,11 @@ import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.IntervalRodIterator;
|
||||||
import org.broadinstitute.sting.gatk.traversals.*;
|
import org.broadinstitute.sting.gatk.traversals.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
@ -77,6 +76,8 @@ public class GenomeAnalysisEngine {
|
||||||
bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", argCollection.HAPMAPFile);
|
bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", argCollection.HAPMAPFile);
|
||||||
if (argCollection.HAPMAPChipFile != null)
|
if (argCollection.HAPMAPChipFile != null)
|
||||||
bindConvenienceRods("hapmap-chip", "GFF", argCollection.HAPMAPChipFile);
|
bindConvenienceRods("hapmap-chip", "GFF", argCollection.HAPMAPChipFile);
|
||||||
|
if ( argCollection.intervals != null )
|
||||||
|
bindConvenienceRods("interval", "Intervals", argCollection.intervals);
|
||||||
|
|
||||||
// parse out the rod bindings
|
// parse out the rod bindings
|
||||||
ReferenceOrderedData.parseBindings(logger, argCollection.RODBindings, rods);
|
ReferenceOrderedData.parseBindings(logger, argCollection.RODBindings, rods);
|
||||||
|
|
@ -114,7 +115,8 @@ public class GenomeAnalysisEngine {
|
||||||
genericEngineSetup(strictness);
|
genericEngineSetup(strictness);
|
||||||
|
|
||||||
// parse out any genomic location they've provided
|
// parse out any genomic location they've provided
|
||||||
List<GenomeLoc> locationsList = setupIntervalRegion();
|
//List<GenomeLoc> locationsList = setupIntervalRegion();
|
||||||
|
List<GenomeLoc> locationsList = engine.getLocations();
|
||||||
GenomeLocSortedSet locs = null;
|
GenomeLocSortedSet locs = null;
|
||||||
if (locationsList != null)
|
if (locationsList != null)
|
||||||
locs = GenomeLocSortedSet.createSetFromList(locationsList);
|
locs = GenomeLocSortedSet.createSetFromList(locationsList);
|
||||||
|
|
@ -202,7 +204,7 @@ public class GenomeAnalysisEngine {
|
||||||
|
|
||||||
// we default interval files over the genome region string
|
// we default interval files over the genome region string
|
||||||
if (argCollection.intervals != null) {
|
if (argCollection.intervals != null) {
|
||||||
engine.setLocation(setupIntervalRegion());
|
engine.setLocation(parseIntervalRegion(argCollection.intervals, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
engine.setReadFilters(new Reads(argCollection));
|
engine.setReadFilters(new Reads(argCollection));
|
||||||
|
|
@ -217,15 +219,15 @@ public class GenomeAnalysisEngine {
|
||||||
*
|
*
|
||||||
* @return a list of genomeLoc representing the interval file
|
* @return a list of genomeLoc representing the interval file
|
||||||
*/
|
*/
|
||||||
private List<GenomeLoc> setupIntervalRegion() {
|
public static List<GenomeLoc> parseIntervalRegion(final String intervalsString, boolean quiet ) {
|
||||||
List<GenomeLoc> locs = null;
|
List<GenomeLoc> locs = null;
|
||||||
if (argCollection.intervals != null) {
|
if ( intervalsString != null) {
|
||||||
if (new File(argCollection.intervals).exists()) {
|
if (new File(intervalsString).exists()) {
|
||||||
logger.info("Intervals argument specifies a file. Loading intervals from file.");
|
if (! quiet) logger.info("Intervals argument specifies a file. Loading intervals from file.");
|
||||||
locs = GenomeLoc.IntervalFileToList(argCollection.intervals);
|
locs = GenomeLoc.IntervalFileToList(intervalsString);
|
||||||
} else {
|
} else {
|
||||||
logger.info("Intervals argument does not specify a file. Trying to parse it as a simple string.");
|
if (! quiet) logger.info("Intervals argument does not specify a file. Trying to parse it as a simple string.");
|
||||||
locs = GenomeLoc.parseGenomeLocs(argCollection.intervals);
|
locs = GenomeLoc.parseGenomeLocs(intervalsString);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return locs;
|
return locs;
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.providers;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
|
@ -43,7 +44,7 @@ public class ReferenceOrderedView implements View {
|
||||||
public ReferenceOrderedView( ShardDataProvider provider ) {
|
public ReferenceOrderedView( ShardDataProvider provider ) {
|
||||||
this.provider = provider;
|
this.provider = provider;
|
||||||
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() )
|
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() )
|
||||||
states.add( new ReferenceOrderedDataState( dataSource, (ReferenceOrderedData.RODIterator)dataSource.seek(provider.getShard()) ) );
|
states.add( new ReferenceOrderedDataState( dataSource, (RODIterator)dataSource.seek(provider.getShard()) ) );
|
||||||
|
|
||||||
provider.register(this);
|
provider.register(this);
|
||||||
}
|
}
|
||||||
|
|
@ -78,9 +79,9 @@ public class ReferenceOrderedView implements View {
|
||||||
*/
|
*/
|
||||||
private class ReferenceOrderedDataState {
|
private class ReferenceOrderedDataState {
|
||||||
public final ReferenceOrderedDataSource dataSource;
|
public final ReferenceOrderedDataSource dataSource;
|
||||||
public final ReferenceOrderedData.RODIterator iterator;
|
public final RODIterator iterator;
|
||||||
|
|
||||||
public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, ReferenceOrderedData.RODIterator iterator ) {
|
public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, RODIterator iterator ) {
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
this.iterator = iterator;
|
this.iterator = iterator;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
|
@ -30,12 +31,12 @@ class IteratorPool {
|
||||||
/**
|
/**
|
||||||
* All iterators of this reference-ordered data.
|
* All iterators of this reference-ordered data.
|
||||||
*/
|
*/
|
||||||
private List<ReferenceOrderedData.RODIterator> allIterators = new ArrayList<ReferenceOrderedData.RODIterator>();
|
private List<RODIterator> allIterators = new ArrayList<RODIterator>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* All iterators that are not currently in service.
|
* All iterators that are not currently in service.
|
||||||
*/
|
*/
|
||||||
private List<ReferenceOrderedData.RODIterator> availableIterators = new ArrayList<ReferenceOrderedData.RODIterator>();
|
private List<RODIterator> availableIterators = new ArrayList<RODIterator>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new iterator pool given the current ROD.
|
* Create a new iterator pool given the current ROD.
|
||||||
|
|
@ -50,10 +51,10 @@ class IteratorPool {
|
||||||
* @param position Target position for the iterator.
|
* @param position Target position for the iterator.
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public ReferenceOrderedData.RODIterator iterator( GenomeLoc position ) {
|
public RODIterator iterator( GenomeLoc position ) {
|
||||||
// Grab the first iterator in the list whose position is before the requested position.
|
// Grab the first iterator in the list whose position is before the requested position.
|
||||||
ReferenceOrderedData.RODIterator selectedIterator = null;
|
RODIterator selectedIterator = null;
|
||||||
for( ReferenceOrderedData.RODIterator iterator: availableIterators ) {
|
for( RODIterator iterator: availableIterators ) {
|
||||||
if( (iterator.position() == null && iterator.hasNext()) ||
|
if( (iterator.position() == null && iterator.hasNext()) ||
|
||||||
(iterator.position() != null && iterator.position().isBefore(position)) ) {
|
(iterator.position() != null && iterator.position().isBefore(position)) ) {
|
||||||
selectedIterator = iterator;
|
selectedIterator = iterator;
|
||||||
|
|
@ -79,7 +80,7 @@ class IteratorPool {
|
||||||
* Close the given iterator, returning it to the pool.
|
* Close the given iterator, returning it to the pool.
|
||||||
* @param iterator Iterator to return to the pool.
|
* @param iterator Iterator to return to the pool.
|
||||||
*/
|
*/
|
||||||
public void close( ReferenceOrderedData.RODIterator iterator ) {
|
public void close( RODIterator iterator ) {
|
||||||
if( !allIterators.contains(iterator) )
|
if( !allIterators.contains(iterator) )
|
||||||
throw new StingException("Iterator does not belong to the given pool.");
|
throw new StingException("Iterator does not belong to the given pool.");
|
||||||
availableIterators.add(iterator);
|
availableIterators.add(iterator);
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
@ -59,7 +60,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
|
||||||
* @return Iterator through the data.
|
* @return Iterator through the data.
|
||||||
*/
|
*/
|
||||||
public Iterator seek( Shard shard ) {
|
public Iterator seek( Shard shard ) {
|
||||||
ReferenceOrderedData.RODIterator iterator = iteratorPool.iterator(shard.getGenomeLoc());
|
RODIterator iterator = iteratorPool.iterator(shard.getGenomeLoc());
|
||||||
return iterator;
|
return iterator;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -67,7 +68,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
|
||||||
* Close the specified iterator, returning it to the pool.
|
* Close the specified iterator, returning it to the pool.
|
||||||
* @param iterator Iterator to close.
|
* @param iterator Iterator to close.
|
||||||
*/
|
*/
|
||||||
public void close( ReferenceOrderedData.RODIterator iterator ) {
|
public void close( RODIterator iterator ) {
|
||||||
this.iteratorPool.close(iterator);
|
this.iteratorPool.close(iterator);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class IntervalRod extends BasicReferenceOrderedDatum {
|
||||||
|
private GenomeLoc loc = null;
|
||||||
|
|
||||||
|
public IntervalRod(String name) {
|
||||||
|
super(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public IntervalRod(String name, GenomeLoc loc) {
|
||||||
|
super(name);
|
||||||
|
this.loc = loc;
|
||||||
|
}
|
||||||
|
|
||||||
|
public GenomeLoc getLocation() { return loc; }
|
||||||
|
|
||||||
|
/** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false),
|
||||||
|
* since this rod provides its own iterator for reading underlying data files.
|
||||||
|
*/
|
||||||
|
public boolean parseLine(Object header, String[] parts) {
|
||||||
|
return false; // this rod has its own iterator
|
||||||
|
}
|
||||||
|
|
||||||
|
public String repl() {
|
||||||
|
throw new StingException("repl() is not implemented yet");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toSimpleString() { return loc.toString(); }
|
||||||
|
public String toString() { return toSimpleString(); }
|
||||||
|
|
||||||
|
public static IntervalRodIterator createIterator(String trackName, File f) throws IOException {
|
||||||
|
return IntervalRodIterator.IntervalRodIteratorFromLocsFile(trackName, f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
|
||||||
|
public class IntervalRodIterator implements Iterator<IntervalRod> {
|
||||||
|
private List<GenomeLoc> locations = null;
|
||||||
|
private Iterator<GenomeLoc> iter;
|
||||||
|
private String trackName = null;
|
||||||
|
|
||||||
|
//public static RODIterator<IntervalRod> IntervalRodIteratorFromLocs(List<GenomeLoc> locs) {
|
||||||
|
// IntervalRodIterator it = new IntervalRodIterator("interval", locs);
|
||||||
|
// return new RODIterator<IntervalRod>(it);
|
||||||
|
//}
|
||||||
|
|
||||||
|
public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
|
||||||
|
//System.out.printf("Parsing %s for intervals %s%n", file, trackName);
|
||||||
|
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(file.getPath(), true);
|
||||||
|
//System.out.printf(" => got %d entries %n", locs.size());
|
||||||
|
return new IntervalRodIterator(trackName, locs);
|
||||||
|
}
|
||||||
|
|
||||||
|
public IntervalRodIterator(String trackName, List<GenomeLoc> locs) {
|
||||||
|
this.trackName = trackName;
|
||||||
|
locations = locs;
|
||||||
|
iter = locations.iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasNext() {
|
||||||
|
return iter.hasNext();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public IntervalRod next() {
|
||||||
|
IntervalRod r = new IntervalRod(trackName, iter.next());
|
||||||
|
//System.out.printf("IntervalRod next is %s%n", r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -22,7 +22,7 @@ import org.apache.log4j.Logger;
|
||||||
* chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
|
* chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class PooledEMSNPROD extends TabularROD implements AllelicVariant {
|
public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes {
|
||||||
public PooledEMSNPROD(final String name) {
|
public PooledEMSNPROD(final String name) {
|
||||||
super(name);
|
super(name);
|
||||||
}
|
}
|
||||||
|
|
@ -45,4 +45,11 @@ public class PooledEMSNPROD extends TabularROD implements AllelicVariant {
|
||||||
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
|
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
|
||||||
public int getPloidy() throws IllegalStateException { return 2; }
|
public int getPloidy() throws IllegalStateException { return 2; }
|
||||||
public boolean isBiallelic() { return true; }
|
public boolean isBiallelic() { return true; }
|
||||||
|
|
||||||
|
// SNPCallFromGenotypes interface
|
||||||
|
public int nIndividuals() { return Integer.parseInt(this.get("EM_N")); }
|
||||||
|
public int nHomRefGenotypes() { return Integer.parseInt(this.get("n_ref")); }
|
||||||
|
public int nHetGenotypes() { return Integer.parseInt(this.get("n_het")); }
|
||||||
|
public int nHomVarGenotypes() { return Integer.parseInt(this.get("n_hom")); }
|
||||||
|
public List<Genotype> getGenotypes() { return null; }
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,95 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
public class RODIterator<ROD extends ReferenceOrderedDatum> implements Iterator<ROD> {
|
||||||
|
private PushbackIterator<ROD> it;
|
||||||
|
private ROD current = null;
|
||||||
|
private GenomeLoc position = null;
|
||||||
|
|
||||||
|
public RODIterator(Iterator<ROD> it) {
|
||||||
|
this.it = new PushbackIterator<ROD>(it);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() { return it.hasNext(); }
|
||||||
|
public ROD next() {
|
||||||
|
ROD next = it.next();
|
||||||
|
if( next != null ) {
|
||||||
|
position = next.getLocation().clone();
|
||||||
|
current = next;
|
||||||
|
}
|
||||||
|
return next;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the current position of this iterator.
|
||||||
|
* @return Current position of the iterator, or null if no position exists.
|
||||||
|
*/
|
||||||
|
public GenomeLoc position() {
|
||||||
|
return position;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Seeks forward in the file until we reach (or cross) a record at contig / pos
|
||||||
|
* If we don't find anything and cross beyond contig / pos, we return null
|
||||||
|
* Otherwise we return the first object who's start is at pos
|
||||||
|
*
|
||||||
|
* @param loc
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public ROD seekForward(final GenomeLoc loc) {
|
||||||
|
final boolean DEBUG = false;
|
||||||
|
|
||||||
|
ROD result = null;
|
||||||
|
|
||||||
|
//if (current != null && current.getName().equals("interval")) {
|
||||||
|
// boolean contains = current.getLocation().containsP(loc);
|
||||||
|
// System.out.printf(" %s : current is %s, seeking to %s, contains %b%n", current.getName(), current.getLocation(), loc, contains);
|
||||||
|
//}
|
||||||
|
|
||||||
|
if ( current != null && current.getLocation().containsP(loc) )
|
||||||
|
return current;
|
||||||
|
|
||||||
|
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
|
||||||
|
while ( hasNext() ) {
|
||||||
|
ROD proposed = next();
|
||||||
|
if( proposed == null )
|
||||||
|
continue;
|
||||||
|
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
||||||
|
boolean containedP = proposed.getLocation().containsP(loc);
|
||||||
|
//System.out.printf(" %s -> Seeking to %s, at %s => contains = %b%n", current.getName(), loc, current.getLocation(), containedP);
|
||||||
|
int cmp = proposed.getLocation().compareTo(loc);
|
||||||
|
if ( cmp < 0 ) {
|
||||||
|
// current occurs before loc, continue searching
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if ( cmp == 0 || containedP ) {
|
||||||
|
result = proposed;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// current is after loc
|
||||||
|
it.pushback(proposed);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( DEBUG ) {
|
||||||
|
if ( result != null )
|
||||||
|
System.out.printf(" ### Found %s%n", result.getLocation());
|
||||||
|
}
|
||||||
|
|
||||||
|
// make a note that the iterator last seeked to the specified position
|
||||||
|
current = result;
|
||||||
|
position = loc.clone();
|
||||||
|
|
||||||
|
// we ran out of elements or found something
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -94,7 +94,7 @@ public class RefMetaDataTracker {
|
||||||
* @param rod
|
* @param rod
|
||||||
*/
|
*/
|
||||||
public void bind(final String name, ReferenceOrderedDatum rod) {
|
public void bind(final String name, ReferenceOrderedDatum rod) {
|
||||||
logger.debug(String.format("Binding %s to %s%n", name, rod));
|
//logger.debug(String.format("Binding %s to %s", name, rod));
|
||||||
map.put(canonicalName(name), rod);
|
map.put(canonicalName(name), rod);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
addModule("RefSeq", rodRefSeq.class);
|
addModule("RefSeq", rodRefSeq.class);
|
||||||
addModule("Table", TabularROD.class);
|
addModule("Table", TabularROD.class);
|
||||||
addModule("PooledEM", PooledEMSNPROD.class);
|
addModule("PooledEM", PooledEMSNPROD.class);
|
||||||
|
addModule("Intervals", IntervalRod.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -159,7 +160,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
return this.name.equals(name) && type.isAssignableFrom(this.type);
|
return this.name.equals(name) && type.isAssignableFrom(this.type);
|
||||||
}
|
}
|
||||||
|
|
||||||
public RODIterator iterator() {
|
public RODIterator<ROD> iterator() {
|
||||||
Iterator<ROD> it;
|
Iterator<ROD> it;
|
||||||
try {
|
try {
|
||||||
Method m = type.getDeclaredMethod("createIterator", String.class,java.io.File.class);
|
Method m = type.getDeclaredMethod("createIterator", String.class,java.io.File.class);
|
||||||
|
|
@ -177,7 +178,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
} catch ( java.lang.reflect.InvocationTargetException e ) {
|
} catch ( java.lang.reflect.InvocationTargetException e ) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
return new RODIterator(it);
|
return new RODIterator<ROD>(it);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
|
|
@ -301,103 +302,6 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// private class SimpleRODIterator implements Iterator<ROD> {
|
|
||||||
// //private WhitespaceTextFileParser parser = null;
|
|
||||||
// private TabbedTextFileParser parser = null;
|
|
||||||
//
|
|
||||||
// public SimpleRODIterator() {
|
|
||||||
// parser = new TabbedTextFileParser(true, file);
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// public boolean hasNext() {
|
|
||||||
// return parser.hasNext();
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// public ROD next() {
|
|
||||||
// String parts[] = parser.next();
|
|
||||||
// return parseLine(parts);
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// public void remove() {
|
|
||||||
// throw new UnsupportedOperationException();
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
public class RODIterator implements Iterator<ROD> {
|
|
||||||
private PushbackIterator<ROD> it;
|
|
||||||
private GenomeLoc position = null;
|
|
||||||
|
|
||||||
public RODIterator(Iterator<ROD> it) {
|
|
||||||
this.it = new PushbackIterator<ROD>(it);
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() { return it.hasNext(); }
|
|
||||||
public ROD next() {
|
|
||||||
ROD next = it.next();
|
|
||||||
if( next != null )
|
|
||||||
position = next.getLocation().clone();
|
|
||||||
return next;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the current position of this iterator.
|
|
||||||
* @return Current position of the iterator, or null if no position exists.
|
|
||||||
*/
|
|
||||||
public GenomeLoc position() {
|
|
||||||
return position;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Seeks forward in the file until we reach (or cross) a record at contig / pos
|
|
||||||
* If we don't find anything and cross beyond contig / pos, we return null
|
|
||||||
* Otherwise we return the first object who's start is at pos
|
|
||||||
*
|
|
||||||
* @param loc
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public ROD seekForward(final GenomeLoc loc) {
|
|
||||||
final boolean DEBUG = false;
|
|
||||||
|
|
||||||
ROD result = null;
|
|
||||||
|
|
||||||
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
|
|
||||||
while ( hasNext() ) {
|
|
||||||
ROD current = next();
|
|
||||||
if( current == null )
|
|
||||||
continue;
|
|
||||||
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
|
||||||
int cmp = current.getLocation().compareTo(loc);
|
|
||||||
if ( cmp < 0 ) {
|
|
||||||
// current occurs before loc, continue searching
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if ( cmp == 0 ) {
|
|
||||||
result = current;
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
// current is after loc
|
|
||||||
it.pushback(current);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( DEBUG ) {
|
|
||||||
if ( result != null )
|
|
||||||
System.out.printf(" ### Found %s%n", result.getLocation());
|
|
||||||
}
|
|
||||||
|
|
||||||
// make a note that the iterator last seeked to the specified position
|
|
||||||
position = loc.clone();
|
|
||||||
|
|
||||||
// we ran out of elements or found something
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// Parsing
|
// Parsing
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.regex.MatchResult;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.xReadLines;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom
|
||||||
|
* chr1:1104840 A N 0.000000 -85.341265 -85.341265 0.000000 0.000000 324.000000 162 0 0
|
||||||
|
* chr1:1104841 C N 0.000000 -69.937928 -69.937928 0.000000 0.000000 324.000000 162 0 0
|
||||||
|
* chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public interface SNPCallFromGenotypes extends AllelicVariant {
|
||||||
|
public int nIndividuals();
|
||||||
|
public int nHomRefGenotypes();
|
||||||
|
public int nHetGenotypes();
|
||||||
|
public int nHomVarGenotypes();
|
||||||
|
public List<Genotype> getGenotypes();
|
||||||
|
}
|
||||||
|
|
@ -11,9 +11,7 @@ import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.util.RuntimeIOException;
|
import net.sf.samtools.util.RuntimeIOException;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.iterators.*;
|
import org.broadinstitute.sting.gatk.iterators.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
|
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
|
||||||
|
|
@ -22,7 +20,6 @@ import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
public abstract class TraversalEngine {
|
public abstract class TraversalEngine {
|
||||||
|
|
@ -30,7 +27,7 @@ public abstract class TraversalEngine {
|
||||||
protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = null;
|
protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = null;
|
||||||
|
|
||||||
// Iterator over rods
|
// Iterator over rods
|
||||||
List<Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>> rodIters;
|
List<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>> rodIters;
|
||||||
|
|
||||||
// How strict should we be with SAM/BAM parsing?
|
// How strict should we be with SAM/BAM parsing?
|
||||||
protected ValidationStringency strictness = ValidationStringency.STRICT;
|
protected ValidationStringency strictness = ValidationStringency.STRICT;
|
||||||
|
|
@ -227,6 +224,10 @@ public abstract class TraversalEngine {
|
||||||
return this.locs != null && !this.locs.isEmpty();
|
return this.locs != null && !this.locs.isEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<GenomeLoc> getLocations() {
|
||||||
|
return this.locs;
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// printing
|
// printing
|
||||||
|
|
@ -446,9 +447,9 @@ public abstract class TraversalEngine {
|
||||||
*/
|
*/
|
||||||
protected void initializeRODs() {
|
protected void initializeRODs() {
|
||||||
// set up reference ordered data
|
// set up reference ordered data
|
||||||
rodIters = new ArrayList<Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>>();
|
rodIters = new ArrayList<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>>();
|
||||||
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
|
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
|
||||||
rodIters.add(new Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>(data.getName(), data.iterator()));
|
rodIters.add(new Pair<String, RODIterator<? extends ReferenceOrderedDatum>>(data.getName(), data.iterator()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -492,10 +493,11 @@ public abstract class TraversalEngine {
|
||||||
*/
|
*/
|
||||||
protected RefMetaDataTracker getReferenceOrderedDataAtLocus(final GenomeLoc loc) {
|
protected RefMetaDataTracker getReferenceOrderedDataAtLocus(final GenomeLoc loc) {
|
||||||
RefMetaDataTracker tracks = new RefMetaDataTracker();
|
RefMetaDataTracker tracks = new RefMetaDataTracker();
|
||||||
for (Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> pair : rodIters) {
|
for (Pair<String, RODIterator<? extends ReferenceOrderedDatum>> pair : rodIters) {
|
||||||
String name = pair.getFirst();
|
String name = pair.getFirst();
|
||||||
tracks.bind(name, pair.getSecond().seekForward(loc));
|
tracks.bind(name, pair.getSecond().seekForward(loc));
|
||||||
}
|
}
|
||||||
|
|
||||||
return tracks;
|
return tracks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
public abstract class BasicVariantAnalysis implements VariantAnalysis {
|
||||||
|
protected String name;
|
||||||
|
protected PrintStream out;
|
||||||
|
protected VariantEvalWalker master;
|
||||||
|
|
||||||
|
public BasicVariantAnalysis(String name) {
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getParams() {
|
||||||
|
return new ArrayList<String>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void initialize(VariantEvalWalker master, PrintStream out) {
|
||||||
|
this.master = master;
|
||||||
|
this.out = out;
|
||||||
|
}
|
||||||
|
|
||||||
|
public PrintStream getPrintStream() {
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> done() {
|
||||||
|
return new ArrayList<String>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public abstract String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.SNPCallFromGenotypes;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.PooledEMSNPROD;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import cern.jet.math.Arithmetic;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: depristo
|
||||||
|
* Date: Jun 4, 2009
|
||||||
|
* Time: 4:38:00 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class HardyWeinbergEquilibrium extends BasicVariantAnalysis {
|
||||||
|
private double threshold;
|
||||||
|
int nSites = 0;
|
||||||
|
int nViolations = 0;
|
||||||
|
|
||||||
|
HardyWeinbergEquilibrium(double threshold) {
|
||||||
|
super("hwe");
|
||||||
|
this.threshold = threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
String r = null;
|
||||||
|
|
||||||
|
if ( eval != null &&
|
||||||
|
eval instanceof SNPCallFromGenotypes ) {
|
||||||
|
nSites++;
|
||||||
|
SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
|
||||||
|
|
||||||
|
int nAA = call.nHomRefGenotypes();
|
||||||
|
int nAa = call.nHetGenotypes();
|
||||||
|
int naa = call.nHomVarGenotypes();
|
||||||
|
int nA = 2 * nAA + nAa;
|
||||||
|
int n = nAA + nAa + naa;
|
||||||
|
|
||||||
|
//
|
||||||
|
// from Emigh 1980
|
||||||
|
//
|
||||||
|
// P = pr[nAa | nA] = multinomial[n over nAA, nAa, naa] / binomial[2n over nA] * 2^nAa
|
||||||
|
//
|
||||||
|
// where nAA, nAa, naa are the observed numbers of the three genotypes, AA, Aa, and aa,
|
||||||
|
// respectively, and nA is the number of A alleles, where nA = 2nAA + nAa, and n is the number of alleles
|
||||||
|
//
|
||||||
|
int[] mXs = { nAA, nAa, naa }; // counts of each genotype as vector
|
||||||
|
double m = MathUtils.multinomial(mXs);
|
||||||
|
double b = Arithmetic.binomial(2 * n, nA);
|
||||||
|
double tosses = Math.pow(2, nAa);
|
||||||
|
double p = (m / b) * tosses;
|
||||||
|
|
||||||
|
if ( false ) {
|
||||||
|
System.out.printf("HWE-violation at %s %f < %f %1.2f %5d %5d %5d %5d %5d %.2e %.2e %.2e => %.6e [%s]%n",
|
||||||
|
call.getLocation(), p, threshold, call.getMAF(), nAA, nAa, naa, nA, n, m, b, tosses, p, eval);
|
||||||
|
System.out.printf("(factorial(%d) / (factorial(%d) * factorial(%d) * factorial(%d))) / choose(%d, %d) * 2^%d - %f < 1e-3%n",
|
||||||
|
nAA + nAa + naa, nAA, nAa, naa, 2 * n, nA, nAa, p);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( p < threshold ) {
|
||||||
|
r = String.format("HWE-violation %f < %f at %s", p, threshold, eval);
|
||||||
|
nViolations++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> done() {
|
||||||
|
List<String> s = new ArrayList<String>();
|
||||||
|
s.add(String.format("n_calls %d", nSites));
|
||||||
|
s.add(String.format("n_violations %d", nViolations));
|
||||||
|
s.add(String.format("violations_rate %.2f", (100.0*nSites) / nViolations));
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,86 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: depristo
|
||||||
|
* Date: Jun 4, 2009
|
||||||
|
* Time: 4:38:19 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
||||||
|
ArrayList<Long> pairWiseDistances;
|
||||||
|
int[] pairWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
|
||||||
|
|
||||||
|
AllelicVariant lastVariant = null;
|
||||||
|
GenomeLoc lastVariantInterval = null;
|
||||||
|
|
||||||
|
public PairwiseDistanceAnalysis() {
|
||||||
|
super("pairwise_distances");
|
||||||
|
pairWiseDistances = new ArrayList<Long>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
String r = null;
|
||||||
|
|
||||||
|
if ( eval != null ) {
|
||||||
|
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
|
||||||
|
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||||
|
|
||||||
|
if (lastVariant != null) {
|
||||||
|
GenomeLoc eL = eval.getLocation();
|
||||||
|
GenomeLoc lvL = lastVariant.getLocation();
|
||||||
|
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
||||||
|
long d = eL.distance(lvL);
|
||||||
|
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
|
||||||
|
// we're on different intervals
|
||||||
|
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
||||||
|
} else {
|
||||||
|
pairWiseDistances.add(d);
|
||||||
|
r = String.format("Pairwise-distance %d %s %s%n", d, eL, lvL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lastVariant = eval;
|
||||||
|
lastVariantInterval = interval;
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> done() {
|
||||||
|
int[] pairCounts = new int[pairWiseBoundries.length];
|
||||||
|
Arrays.fill(pairCounts, 0);
|
||||||
|
for ( long dist : pairWiseDistances ) {
|
||||||
|
boolean done = false;
|
||||||
|
for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
|
||||||
|
int maxDist = pairWiseBoundries[i];
|
||||||
|
if ( dist <= maxDist ) {
|
||||||
|
pairCounts[i]++;
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> s = new ArrayList<String>();
|
||||||
|
s.add(String.format("snps counted for pairwise distance: %d", pairWiseDistances.size()));
|
||||||
|
for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
|
||||||
|
int maxDist = pairWiseBoundries[i];
|
||||||
|
int count = pairCounts[i];
|
||||||
|
s.add(String.format("snps within %8d bp: %d", maxDist, count));
|
||||||
|
}
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,74 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: depristo
|
||||||
|
* Date: Jun 4, 2009
|
||||||
|
* Time: 4:38:00 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class TransitionTranversionAnalysis extends BasicVariantAnalysis {
|
||||||
|
int N_TRANSITION_TRANVERSION_BINS = 100;
|
||||||
|
Histogram<Integer> transitions;
|
||||||
|
Histogram<Integer> transversions;
|
||||||
|
|
||||||
|
public TransitionTranversionAnalysis() {
|
||||||
|
super("transitions_transversions");
|
||||||
|
transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
||||||
|
transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
if ( eval != null && eval.isSNP() ) {
|
||||||
|
char refBase = eval.getRefSnpFWD();
|
||||||
|
char altBase = eval.getAltSnpFWD();
|
||||||
|
//System.out.printf("%c %c%n", refBase, altBase);
|
||||||
|
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
|
||||||
|
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
|
||||||
|
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
|
||||||
|
|
||||||
|
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
|
||||||
|
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
|
||||||
|
double het = eval.getHeterozygosity();
|
||||||
|
h.setBin(het, h.getBin(het) + 1);
|
||||||
|
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
|
||||||
|
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
|
||||||
|
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> done() {
|
||||||
|
int nTransitions = 0;
|
||||||
|
int nTransversions = 0;
|
||||||
|
|
||||||
|
List<String> s = new ArrayList<String>();
|
||||||
|
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
|
||||||
|
//double avHet = Math.pow(10, transitions.bin2x(i));
|
||||||
|
double avHet = transitions.bin2x(i);
|
||||||
|
if ( avHet > 0.5 ) break;
|
||||||
|
|
||||||
|
int sit = transitions.getBin(i);
|
||||||
|
int ver = transversions.getBin(i);
|
||||||
|
nTransitions += sit;
|
||||||
|
nTransversions += ver;
|
||||||
|
double ratio = (float)sit/ver;
|
||||||
|
//s.append(String.format("%s %s %.2f %d %d %f%n", commentLine, prefix, avHet, sit, ver, ratio));
|
||||||
|
}
|
||||||
|
|
||||||
|
s.add(String.format("transitions %d", nTransitions));
|
||||||
|
s.add(String.format("transversions %d", nTransversions));
|
||||||
|
s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions)));
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public interface VariantAnalysis {
|
||||||
|
public String getName();
|
||||||
|
public PrintStream getPrintStream();
|
||||||
|
public List<String> getParams();
|
||||||
|
public void initialize(VariantEvalWalker master, PrintStream out);
|
||||||
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
|
||||||
|
public List<String> done();
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,33 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
public class VariantCounter extends BasicVariantAnalysis {
|
||||||
|
int nBasesCovered = 0;
|
||||||
|
int nSNPs = 0;
|
||||||
|
|
||||||
|
public VariantCounter() {
|
||||||
|
super("variant_counts");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
nBasesCovered++;
|
||||||
|
nSNPs += eval == null ? 0 : 1;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> done() {
|
||||||
|
List<String> s = new ArrayList<String>();
|
||||||
|
s.add(String.format("n bases covered: %d", nBasesCovered));
|
||||||
|
s.add(String.format("sites: %d", nSNPs));
|
||||||
|
s.add(String.format("variant rate: %.5f confident variants per base", nSNPs / (1.0 * nBasesCovered)));
|
||||||
|
s.add(String.format("variant rate: 1 / %d confident variants per base", nBasesCovered / nSNPs));
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,12 +1,22 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
public class VariantDBCoverage {
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
public class VariantDBCoverage extends BasicVariantAnalysis {
|
||||||
private String dbName;
|
private String dbName;
|
||||||
private int nDBObs = 0;
|
private int nDBObs = 0;
|
||||||
private int nEvalObs = 0;
|
private int nEvalObs = 0;
|
||||||
private int nOverlapping = 0;
|
private int nOverlapping = 0;
|
||||||
|
|
||||||
public VariantDBCoverage(final String name) {
|
public VariantDBCoverage(final String name) {
|
||||||
|
super("db_coverage");
|
||||||
dbName = name;
|
dbName = name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -37,6 +47,13 @@ public class VariantDBCoverage {
|
||||||
return nOverlappingSites() / (1.0 * nEvalSites());
|
return nOverlappingSites() / (1.0 * nEvalSites());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
// There are four cases here:
|
||||||
|
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null);
|
||||||
|
inc(dbsnp != null, eval != null);
|
||||||
|
return dbsnp == null && eval != null ? "Novel " + eval : null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* What fraction of the DB sites were discovered in the evalution calls?
|
* What fraction of the DB sites were discovered in the evalution calls?
|
||||||
*
|
*
|
||||||
|
|
@ -46,18 +63,15 @@ public class VariantDBCoverage {
|
||||||
return nOverlappingSites() / (1.0 * nDBSites());
|
return nOverlappingSites() / (1.0 * nDBSites());
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toSingleLineString(final String prefix) {
|
public List<String> done() {
|
||||||
return String.format("%s %d\t%d\t%d\t%.2f\t%.2f", prefix, nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval());
|
List<String> s = new ArrayList<String>();
|
||||||
}
|
s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()));
|
||||||
|
s.add(String.format("name %s", dbName));
|
||||||
public String toMultiLineString(final String prefix) {
|
s.add(String.format("n_db_sites %d", nDBSites()));
|
||||||
StringBuilder s = new StringBuilder();
|
s.add(String.format("n_eval_sites %d", nEvalSites()));
|
||||||
s.append(String.format("%s name %s%n", prefix, dbName));
|
s.add(String.format("n_overlapping_sites %d", nOverlappingSites()));
|
||||||
s.append(String.format("%s n_db_sites %d%n", prefix, nDBSites()));
|
s.add(String.format("per_eval_sites_in_db %.2f", 100*fractionEvalSitesCoveredByDB()));
|
||||||
s.append(String.format("%s n_eval_sites %d%n", prefix, nEvalSites()));
|
s.add(String.format("per_db_sites_in_eval %.2f", 100*fractionDBSitesDiscoveredInEval()));
|
||||||
s.append(String.format("%s n_overlapping_sites %d%n", prefix, nOverlappingSites()));
|
return s;
|
||||||
s.append(String.format("%s per_eval_sites_in_db %.2f%n", prefix, 100*fractionEvalSitesCoveredByDB()));
|
|
||||||
s.append(String.format("%s per_db_sites_in_eval %.2f%n", prefix, 100*fractionDBSitesDiscoveredInEval()));
|
|
||||||
return s.toString();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1,24 +1,16 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
import java.util.EnumMap;
|
import java.util.*;
|
||||||
import java.util.ArrayList;
|
import java.io.*;
|
||||||
import java.util.Arrays;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Created by IntelliJ IDEA.
|
|
||||||
* User: mdepristo
|
|
||||||
* Date: Feb 22, 2009
|
|
||||||
* Time: 2:52:28 PM
|
|
||||||
* To change this template use File | Settings | File Templates.
|
|
||||||
*/
|
|
||||||
@By(DataSource.REFERENCE)
|
@By(DataSource.REFERENCE)
|
||||||
@Requires(DataSource.REFERENCE)
|
@Requires(DataSource.REFERENCE)
|
||||||
@Allows(DataSource.REFERENCE)
|
@Allows(DataSource.REFERENCE)
|
||||||
|
|
@ -29,110 +21,114 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||||
@Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false)
|
@Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false)
|
||||||
public boolean printVariants = false;
|
public boolean printVariants = false;
|
||||||
|
|
||||||
int nBasesCovered = 0;
|
@Argument(shortName="badHWEThreshold", doc="XXX", required=false)
|
||||||
int nSNPs = 0;
|
public double badHWEThreshold = 0.001;
|
||||||
VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP");
|
|
||||||
|
|
||||||
int N_TRANSITION_TRANVERSION_BINS = 100;
|
String analysisFilenameBase = null;
|
||||||
Histogram<Integer> transitions;
|
|
||||||
Histogram<Integer> transversions;
|
|
||||||
|
|
||||||
@Argument(shortName="outputFilenameBase", doc="", required=false)
|
String COMMENT_STRING = "";
|
||||||
String outputFilenameBase = null;
|
|
||||||
|
|
||||||
ArrayList<Long> pairWiseDistances;
|
HashMap<String, ArrayList<VariantAnalysis>> analysisSets;
|
||||||
int[] pairWiseBoundries = {1, 10, 100, 1000, 10000, 1000000000};
|
|
||||||
AllelicVariant lastVariant = null;
|
|
||||||
|
|
||||||
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> transition_transversion_counts[];
|
final String ALL_SNPS = "all";
|
||||||
|
final String SINGLETON_SNPS = "singletons";
|
||||||
|
final String TWOHIT_SNPS = "2plus_hit";
|
||||||
|
final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS };
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
//transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
|
// setup the path to the analysis
|
||||||
//transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
|
if ( this.getToolkit().getArguments().outFileName != null ) {
|
||||||
transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
analysisFilenameBase = this.getToolkit().getArguments().outFileName + ".analysis.";
|
||||||
transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
}
|
||||||
pairWiseDistances = new ArrayList<Long>();
|
|
||||||
|
analysisSets = new HashMap<String, ArrayList<VariantAnalysis>>();
|
||||||
|
for ( String setName : ALL_ANALYSIS_NAMES ) {
|
||||||
|
analysisSets.put(setName, initializeAnalysisSet(setName));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//private int transition_transversion_bin(double het) {
|
private ArrayList<VariantAnalysis> getAnalysisSet(final String name) {
|
||||||
// return (int)Math.floor(het * N_TRANSITION_TRANVERSION_BINS);
|
return analysisSets.get(name);
|
||||||
//}
|
}
|
||||||
|
|
||||||
//private double transition_transversion_bin2het(int bin) {
|
private ArrayList<VariantAnalysis> initializeAnalysisSet(final String setName) {
|
||||||
// return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS;
|
ArrayList<VariantAnalysis> analyses = new ArrayList<VariantAnalysis>();
|
||||||
//}
|
|
||||||
|
//
|
||||||
|
// Add new analyzes here!
|
||||||
|
//
|
||||||
|
analyses.add(new VariantCounter());
|
||||||
|
analyses.add(new VariantDBCoverage("dbSNP"));
|
||||||
|
analyses.add(new TransitionTranversionAnalysis());
|
||||||
|
analyses.add(new PairwiseDistanceAnalysis());
|
||||||
|
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||||
|
|
||||||
|
|
||||||
|
if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));
|
||||||
|
|
||||||
|
for ( VariantAnalysis analysis : analyses ) {
|
||||||
|
analysis.initialize(this, openAnalysisOutputStream(setName, analysis));
|
||||||
|
}
|
||||||
|
|
||||||
|
return analyses;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the filename of the analysis output file where output for an analysis with
|
||||||
|
* @param name
|
||||||
|
* @param params
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public String getAnalysisFilename(final String name, final List<String> params) {
|
||||||
|
if ( analysisFilenameBase == null )
|
||||||
|
return null;
|
||||||
|
else
|
||||||
|
return analysisFilenameBase + Utils.join(".", Utils.cons(name, params));
|
||||||
|
}
|
||||||
|
|
||||||
|
public PrintStream openAnalysisOutputStream(final String setName, VariantAnalysis analysis) {
|
||||||
|
final String filename = getAnalysisFilename(setName + "." + analysis.getName(), analysis.getParams());
|
||||||
|
if ( filename == null )
|
||||||
|
return out;
|
||||||
|
else {
|
||||||
|
File file = new File(filename);
|
||||||
|
try {
|
||||||
|
return new PrintStream(new FileOutputStream(file));
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
throw new StingException("Couldn't open analysis output file " + filename, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
nBasesCovered++;
|
// Iterate over each analysis, and update it
|
||||||
|
|
||||||
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null);
|
|
||||||
AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
|
AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
|
||||||
|
if ( eval != null && eval.getVariationConfidence() < minDiscoveryQ )
|
||||||
|
eval = null;
|
||||||
|
|
||||||
if ( printVariants && ( eval != null || dbsnp != null ) ) {
|
updateAnalysisSet(ALL_SNPS, eval, tracker, ref, context);
|
||||||
String matchFlag = " ";
|
|
||||||
if ( eval != null && dbsnp != null ) matchFlag = "*** ";
|
|
||||||
if ( eval == null && dbsnp != null ) matchFlag = ">>> ";
|
|
||||||
if ( eval != null && dbsnp == null ) matchFlag = "<<< ";
|
|
||||||
|
|
||||||
System.out.printf("%sDBSNP: %s%n%sEVAL:%s%n",
|
if ( eval instanceof SNPCallFromGenotypes ) {
|
||||||
matchFlag, dbsnp,
|
SNPCallFromGenotypes call = (PooledEMSNPROD)eval;
|
||||||
matchFlag, eval);
|
int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
|
||||||
|
//System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls);
|
||||||
|
final String s = nVarGenotypes == 1 ? SINGLETON_SNPS : TWOHIT_SNPS;
|
||||||
|
updateAnalysisSet(s, eval, tracker, ref, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( eval != null ) {
|
|
||||||
//System.out.printf("GREP ME%n");
|
|
||||||
if ( ! eval.isSNP() || eval.getVariationConfidence() < minDiscoveryQ )
|
|
||||||
System.out.printf("We have a problem at %s: %b %f%n", eval, eval.isSNP(), eval.getVariationConfidence());
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) {
|
|
||||||
//System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
|
|
||||||
//System.out.printf("N snps %d = %s%n", ++nSNPs, eval.getLocation());
|
|
||||||
|
|
||||||
updateTransitionTransversion(eval, ref, context);
|
|
||||||
|
|
||||||
if (lastVariant != null) updatePairwiseDistances(eval, lastVariant);
|
|
||||||
lastVariant = eval;
|
|
||||||
//updateHapMapRate(dbsnp, eval, ref, context);
|
|
||||||
}
|
|
||||||
updateVariantDBCoverage(dbsnp, eval, ref, context);
|
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void updatePairwiseDistances(AllelicVariant eval, AllelicVariant lastVariant) {
|
|
||||||
GenomeLoc eL = eval.getLocation();
|
public void updateAnalysisSet(final String analysisSetName, AllelicVariant eval,
|
||||||
GenomeLoc lvL = lastVariant.getLocation();
|
RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
// Iterate over each analysis, and update it
|
||||||
long d = eL.distance(lvL);
|
for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
|
||||||
pairWiseDistances.add(d);
|
String s = analysis.update(eval, tracker, ref, context);
|
||||||
out.printf("# Pairwise-distance %d %s %s%n", d, eL, lvL);
|
if ( s != null ) analysis.getPrintStream().println(s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void updateTransitionTransversion(AllelicVariant dbsnp, char ref, LocusContext context) {
|
|
||||||
char refBase = dbsnp.getRefSnpFWD();
|
|
||||||
char altBase = dbsnp.getAltSnpFWD();
|
|
||||||
//System.out.printf("%c %c%n", refBase, altBase);
|
|
||||||
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
|
|
||||||
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
|
|
||||||
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
|
|
||||||
|
|
||||||
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
|
|
||||||
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
|
|
||||||
double het = dbsnp.getHeterozygosity();
|
|
||||||
h.setBin(het, h.getBin(het) + 1);
|
|
||||||
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
|
|
||||||
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
|
|
||||||
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateVariantDBCoverage(AllelicVariant dbsnp, AllelicVariant eval, char ref, LocusContext context) {
|
|
||||||
// There are four cases here:
|
|
||||||
dbSNPStats.inc(dbsnp != null, eval != null);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Given result of map function
|
// Given result of map function
|
||||||
public Integer reduceInit() { return 0; }
|
public Integer reduceInit() { return 0; }
|
||||||
public Integer reduce(Integer value, Integer sum) {
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
|
@ -143,54 +139,24 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone(Integer result) {
|
public void onTraversalDone(Integer result) {
|
||||||
int nTransitions = 0;
|
for ( String analysisSetName : ALL_ANALYSIS_NAMES ) {
|
||||||
int nTransversions = 0;
|
printAnalysisSet(analysisSetName);
|
||||||
|
|
||||||
StringBuilder s = new StringBuilder();
|
|
||||||
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
|
|
||||||
//double avHet = Math.pow(10, transitions.bin2x(i));
|
|
||||||
double avHet = transitions.bin2x(i);
|
|
||||||
if ( avHet > 0.5 ) break;
|
|
||||||
|
|
||||||
int sit = transitions.getBin(i);
|
|
||||||
int ver = transversions.getBin(i);
|
|
||||||
nTransitions += sit;
|
|
||||||
nTransversions += ver;
|
|
||||||
double ratio = (float)sit/ver;
|
|
||||||
s.append(String.format("%.2f %d %d %f%n", avHet, sit, ver, ratio));
|
|
||||||
}
|
}
|
||||||
out.printf("# n bases covered: %d%n", nBasesCovered);
|
}
|
||||||
out.printf("# sites: %d%n", nSNPs);
|
|
||||||
out.printf("# variant rate: %.5f confident variants per base%n", nSNPs / (1.0 * nBasesCovered));
|
|
||||||
out.printf("# variant rate: 1 / %d confident variants per base%n", nBasesCovered / nSNPs);
|
|
||||||
out.printf("# transitions: %d%n", nTransitions);
|
|
||||||
out.printf("# transversions: %d%n", nTransversions);
|
|
||||||
out.printf("# ratio: %.2f%n", nTransitions / (1.0 * nTransversions));
|
|
||||||
|
|
||||||
// dbSNP stats
|
private void printAnalysisSet( final String analysisSetName ) {
|
||||||
out.println(dbSNPStats.toSingleLineString("#"));
|
Date now = new Date();
|
||||||
out.print(dbSNPStats.toMultiLineString("#"));
|
for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
|
||||||
|
PrintStream stream = analysis.getPrintStream(); // getAnalysisOutputStream(analysisSetName + "." + analysis.getName(), analysis.getParams());
|
||||||
int[] pairCounts = new int[pairWiseBoundries.length];
|
stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78));
|
||||||
Arrays.fill(pairCounts, 0);
|
stream.printf("%sAnalysis set %s%n", COMMENT_STRING, analysisSetName);
|
||||||
for ( long dist : pairWiseDistances ) {
|
stream.printf("%sAnalysis name %s%n", COMMENT_STRING, analysis.getName());
|
||||||
boolean done = false;
|
stream.printf("%sAnalysis params %s%n", COMMENT_STRING, Utils.join(" ", analysis.getParams()));
|
||||||
for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
|
stream.printf("%sAnalysis class %s%n", COMMENT_STRING, analysis );
|
||||||
int maxDist = pairWiseBoundries[i];
|
stream.printf("%sAnalysis time %s%n", COMMENT_STRING, now );
|
||||||
if ( dist < maxDist ) {
|
for ( String line : analysis.done()) {
|
||||||
pairCounts[i]++;
|
stream.printf("%s %s%n", COMMENT_STRING, line);
|
||||||
done = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
out.printf("# snps counted for pairwise distance: %d%n", pairWiseDistances.size());
|
|
||||||
for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
|
|
||||||
int maxDist = pairWiseBoundries[i];
|
|
||||||
int count = pairCounts[i];
|
|
||||||
out.printf("# snps within %d bp: %d%n", maxDist, count);
|
|
||||||
}
|
|
||||||
|
|
||||||
out.print(s);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
public class VariantMatcher extends BasicVariantAnalysis {
|
||||||
|
String dbName;
|
||||||
|
|
||||||
|
public VariantMatcher(final String name) {
|
||||||
|
super("variant_matches");
|
||||||
|
dbName = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
String r = null;
|
||||||
|
AllelicVariant db = (AllelicVariant)tracker.lookup(dbName, null);
|
||||||
|
|
||||||
|
if ( eval != null || db != null ) {
|
||||||
|
String matchFlag = " ";
|
||||||
|
if ( eval != null && db != null ) matchFlag = "*** ";
|
||||||
|
if ( eval == null && db != null ) matchFlag = ">>> ";
|
||||||
|
if ( eval != null && db == null ) matchFlag = "<<< ";
|
||||||
|
|
||||||
|
r = String.format("%s %s: %s <=> EVAL: %s", dbName, matchFlag, db, eval);
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -84,6 +84,35 @@ public class MathUtils {
|
||||||
//return (new cern.jet.random.Binomial(n, p, cern.jet.random.engine.RandomEngine.makeDefault())).pdf(k);
|
//return (new cern.jet.random.Binomial(n, p, cern.jet.random.engine.RandomEngine.makeDefault())).pdf(k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes a multinomial. This is computed using the formula
|
||||||
|
*
|
||||||
|
* M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ]
|
||||||
|
*
|
||||||
|
* where xi represents the number of times outcome i was observed, n is the number of total observations.
|
||||||
|
* In this implementation, the value of n is inferred as the sum over i of xi.
|
||||||
|
*
|
||||||
|
* @param x an int[] of counts, where each element represents the number of times a certain outcome was observed
|
||||||
|
* @return the multinomial of the specified configuration.
|
||||||
|
*/
|
||||||
|
public static double multinomial(int[] x) {
|
||||||
|
// In order to avoid overflow in computing large factorials in the multinomial
|
||||||
|
// coefficient, we split the calculation up into the product of a bunch of
|
||||||
|
// binomial coefficients.
|
||||||
|
|
||||||
|
double multinomialCoefficient = 1.0;
|
||||||
|
|
||||||
|
for (int i = 0; i < x.length; i++) {
|
||||||
|
int n = 0;
|
||||||
|
for (int j = 0; j <= i; j++) { n += x[j]; }
|
||||||
|
|
||||||
|
double multinomialTerm = Arithmetic.binomial(n, x[i]);
|
||||||
|
multinomialCoefficient *= multinomialTerm;
|
||||||
|
}
|
||||||
|
|
||||||
|
return multinomialCoefficient;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes a multinomial probability. This is computed using the formula
|
* Computes a multinomial probability. This is computed using the formula
|
||||||
*
|
*
|
||||||
|
|
@ -101,16 +130,7 @@ public class MathUtils {
|
||||||
// In order to avoid overflow in computing large factorials in the multinomial
|
// In order to avoid overflow in computing large factorials in the multinomial
|
||||||
// coefficient, we split the calculation up into the product of a bunch of
|
// coefficient, we split the calculation up into the product of a bunch of
|
||||||
// binomial coefficients.
|
// binomial coefficients.
|
||||||
|
double multinomialCoefficient = multinomial(x);
|
||||||
double multinomialCoefficient = 1.0;
|
|
||||||
|
|
||||||
for (int i = 0; i < x.length; i++) {
|
|
||||||
int n = 0;
|
|
||||||
for (int j = 0; j <= i; j++) { n += x[j]; }
|
|
||||||
|
|
||||||
double multinomialTerm = Arithmetic.binomial(n, x[i]);
|
|
||||||
multinomialCoefficient *= multinomialTerm;
|
|
||||||
}
|
|
||||||
|
|
||||||
double probs = 1.0, totalprob = 0.0;
|
double probs = 1.0, totalprob = 0.0;
|
||||||
for (int obsCountsIndex = 0; obsCountsIndex < x.length; obsCountsIndex++) {
|
for (int obsCountsIndex = 0; obsCountsIndex < x.length; obsCountsIndex++) {
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,13 @@ public class Utils {
|
||||||
throw new RuntimeException(msg);
|
throw new RuntimeException(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T> List<T> cons(final T elt, final List<T> l) {
|
||||||
|
List<T> l2 = new ArrayList<T>();
|
||||||
|
l2.add(elt);
|
||||||
|
if ( l != null ) l2.addAll(l);
|
||||||
|
return l2;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* pretty print the warning message supplied
|
* pretty print the warning message supplied
|
||||||
* @param message the message
|
* @param message the message
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
@ -56,7 +57,7 @@ public class IteratorPoolTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testCreateSingleIterator() {
|
public void testCreateSingleIterator() {
|
||||||
IteratorPool iteratorPool = new IteratorPool(rod);
|
IteratorPool iteratorPool = new IteratorPool(rod);
|
||||||
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
|
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 );
|
||||||
|
|
||||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||||
|
|
@ -77,10 +78,10 @@ public class IteratorPoolTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testCreateMultipleIterators() {
|
public void testCreateMultipleIterators() {
|
||||||
IteratorPool iteratorPool = new IteratorPool(rod);
|
IteratorPool iteratorPool = new IteratorPool(rod);
|
||||||
ReferenceOrderedData.RODIterator iterator1 = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
|
RODIterator iterator1 = (RODIterator)iteratorPool.iterator( testSite1 );
|
||||||
|
|
||||||
// Create a new iterator at position 2.
|
// Create a new iterator at position 2.
|
||||||
ReferenceOrderedData.RODIterator iterator2 = iteratorPool.iterator( testSite2 );
|
RODIterator iterator2 = iteratorPool.iterator( testSite2 );
|
||||||
|
|
||||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators());
|
Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators());
|
||||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||||
|
|
@ -127,7 +128,7 @@ public class IteratorPoolTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testIteratorConservation() {
|
public void testIteratorConservation() {
|
||||||
IteratorPool iteratorPool = new IteratorPool(rod);
|
IteratorPool iteratorPool = new IteratorPool(rod);
|
||||||
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
|
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 );
|
||||||
|
|
||||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||||
|
|
@ -162,7 +163,7 @@ public class IteratorPoolTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testIteratorCreation() {
|
public void testIteratorCreation() {
|
||||||
IteratorPool iteratorPool = new IteratorPool(rod);
|
IteratorPool iteratorPool = new IteratorPool(rod);
|
||||||
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite3 );
|
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite3 );
|
||||||
|
|
||||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ import java.util.ArrayList;
|
||||||
public class TabularRODTest extends BaseTest {
|
public class TabularRODTest extends BaseTest {
|
||||||
private static FastaSequenceFile2 seq;
|
private static FastaSequenceFile2 seq;
|
||||||
private ReferenceOrderedData ROD;
|
private ReferenceOrderedData ROD;
|
||||||
private ReferenceOrderedData.RODIterator iter;
|
private RODIterator iter;
|
||||||
|
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
|
|
@ -113,7 +113,7 @@ public class TabularRODTest extends BaseTest {
|
||||||
public void testDelim1() {
|
public void testDelim1() {
|
||||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
RODIterator iter_commas = ROD_commas.iterator();
|
||||||
|
|
||||||
logger.warn("Executing testDelim1");
|
logger.warn("Executing testDelim1");
|
||||||
TabularROD one2 = (TabularROD)iter_commas.next();
|
TabularROD one2 = (TabularROD)iter_commas.next();
|
||||||
|
|
@ -130,7 +130,7 @@ public class TabularRODTest extends BaseTest {
|
||||||
TabularROD.setDelimiter(",",",");
|
TabularROD.setDelimiter(",",",");
|
||||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
RODIterator iter_commas = ROD_commas.iterator();
|
||||||
|
|
||||||
logger.warn("Executing testDelim1");
|
logger.warn("Executing testDelim1");
|
||||||
TabularROD one2 = (TabularROD)iter_commas.next();
|
TabularROD one2 = (TabularROD)iter_commas.next();
|
||||||
|
|
@ -173,7 +173,7 @@ public class TabularRODTest extends BaseTest {
|
||||||
out.println(row.toString());
|
out.println(row.toString());
|
||||||
|
|
||||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class);
|
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class);
|
||||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
RODIterator iter_commas = ROD_commas.iterator();
|
||||||
|
|
||||||
TabularROD one = (TabularROD)iter_commas.next();
|
TabularROD one = (TabularROD)iter_commas.next();
|
||||||
assertTrue(one.size() == 3);
|
assertTrue(one.size() == 3);
|
||||||
|
|
|
||||||
|
|
@ -63,6 +63,7 @@ def main():
|
||||||
if filter(lambda x: x <> 0, jobids) <> []:
|
if filter(lambda x: x <> 0, jobids) <> []:
|
||||||
# there's still work to do
|
# there's still work to do
|
||||||
sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
|
sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
|
||||||
|
# TODO: Should add a wait here for all farm jobs to finish...
|
||||||
print 'gelis', gelis
|
print 'gelis', gelis
|
||||||
print 'jobids', jobids
|
print 'jobids', jobids
|
||||||
else:
|
else:
|
||||||
|
|
@ -76,8 +77,11 @@ def main():
|
||||||
dbsnpFile = geli2dbsnpFile(geli)
|
dbsnpFile = geli2dbsnpFile(geli)
|
||||||
if not os.path.exists(dbsnpFile):
|
if not os.path.exists(dbsnpFile):
|
||||||
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
|
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
|
||||||
|
if jobid == 0: jobid = None
|
||||||
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||||
|
|
||||||
|
# TODO: Should add a wait here for all farm jobs to finish...
|
||||||
|
|
||||||
# read in the dbSNP tracks
|
# read in the dbSNP tracks
|
||||||
nTotalSnps = 0
|
nTotalSnps = 0
|
||||||
nNovelSnps = 0
|
nNovelSnps = 0
|
||||||
|
|
@ -99,11 +103,12 @@ def main():
|
||||||
jobid = None
|
jobid = None
|
||||||
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
||||||
for geli, variantOut in zip(gelis, variantsOut):
|
for geli, variantOut in zip(gelis, variantsOut):
|
||||||
|
name = os.path.split(geli)[1]
|
||||||
if not os.path.exists(variantOut):
|
if not os.path.exists(variantOut):
|
||||||
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )
|
cmd = ("GeliToText.jar I=%s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | awk '$7 > %f {print \"%s\" $0}' > %s" % ( geli, OPTIONS.lod, name, variantOut) )
|
||||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
|
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
|
||||||
|
|
||||||
cmd = ("cat %s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
|
cmd = ("cat %s | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
|
||||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||||
|
|
||||||
sortedCallFile = 'all.sorted.calls'
|
sortedCallFile = 'all.sorted.calls'
|
||||||
|
|
@ -113,14 +118,14 @@ def main():
|
||||||
sortedCalls = [line.split() for line in open(sortedCallFile)]
|
sortedCalls = [line.split() for line in open(sortedCallFile)]
|
||||||
aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
|
aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
|
||||||
|
|
||||||
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'
|
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom individuals'
|
||||||
|
|
||||||
for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
|
for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
|
||||||
if snp == None: continue # ignore bad calls
|
if snp == None: continue # ignore bad calls
|
||||||
#print snp
|
#print snp
|
||||||
#sharedCalls = list(sharedCallsGroup)
|
#sharedCalls = list(sharedCallsGroup)
|
||||||
#genotype = list(sharedCalls[0][5])
|
#genotype = list(sharedCalls[0][5])
|
||||||
print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
|
print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d NA' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
|
||||||
outputFile.close()
|
outputFile.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
|
||||||
if waitID <> None:
|
if waitID <> None:
|
||||||
cmd_str += " -w \"done(%s)\"" % (str(waitID))
|
cmd_str += " -w \"done(%s)\"" % (str(waitID))
|
||||||
|
|
||||||
cmd_str += " "+cmd_str_from_user
|
cmd_str += " \""+cmd_str_from_user + "\""
|
||||||
|
|
||||||
print ">>> Farming via "+cmd_str
|
print ">>> Farming via "+cmd_str
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -55,13 +55,22 @@ def genotypes2allelefrequencies(ref, genotypes, nIndividuals = -1):
|
||||||
nAltChroms = nChroms - nRefChroms
|
nAltChroms = nChroms - nRefChroms
|
||||||
p = float(nRefChroms) / nChroms
|
p = float(nRefChroms) / nChroms
|
||||||
q = float(nAltChroms) / nChroms
|
q = float(nAltChroms) / nChroms
|
||||||
|
|
||||||
|
#print 'genotypes', genotypes
|
||||||
|
#print 'alleles', alleles
|
||||||
|
#print 'nChroms', nChroms
|
||||||
|
#print 'nCalledChroms', nCalledChroms
|
||||||
|
#print 'nMissingChroms', nMissingChroms
|
||||||
|
#print 'nRefChroms', nRefChroms
|
||||||
|
#print 'nAltChroms', nAltChroms
|
||||||
|
#print 'p, q', p, q
|
||||||
|
|
||||||
assert p + q == 1
|
assert p + q == 1
|
||||||
|
|
||||||
return [p, q, n]
|
return [p, q, n]
|
||||||
|
|
||||||
class PicardSNP:
|
class PicardSNP:
|
||||||
def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes):
|
def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes, sources):
|
||||||
self.loc = loc
|
self.loc = loc
|
||||||
self.ref = ref
|
self.ref = ref
|
||||||
self.polymorphism = polymorphism
|
self.polymorphism = polymorphism
|
||||||
|
|
@ -69,6 +78,7 @@ class PicardSNP:
|
||||||
self.nIndividuals = allelefrequencies[2]
|
self.nIndividuals = allelefrequencies[2]
|
||||||
self.allelefrequencies = allelefrequencies
|
self.allelefrequencies = allelefrequencies
|
||||||
self.genotypes = genotypes
|
self.genotypes = genotypes
|
||||||
|
self.sources = sources
|
||||||
|
|
||||||
def refGenotype(self):
|
def refGenotype(self):
|
||||||
return self.ref + self.ref
|
return self.ref + self.ref
|
||||||
|
|
@ -92,7 +102,7 @@ class PicardSNP:
|
||||||
return self.allelefrequencies[1]
|
return self.allelefrequencies[1]
|
||||||
|
|
||||||
def countGenotype(self, genotype):
|
def countGenotype(self, genotype):
|
||||||
r = len(filter( lambda x: x == genotype, self.genotypes ))
|
r = len(filter( lambda x: sorted(x) == sorted(genotype), self.genotypes ))
|
||||||
#print 'countGenotype', genotype, self.genotypes, r
|
#print 'countGenotype', genotype, self.genotypes, r
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
|
@ -105,7 +115,6 @@ class PicardSNP:
|
||||||
def nHomVarGenotypes(self):
|
def nHomVarGenotypes(self):
|
||||||
return self.countGenotype(self.homVarGenotype())
|
return self.countGenotype(self.homVarGenotype())
|
||||||
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return '%s %s %s %s %s %s' % ( self.loc, self.ref, str(self.polymorphism), str(self.het()), str(self.allelefrequencies), str(self.genotypes))
|
return '%s %s %s %s %s %s' % ( self.loc, self.ref, str(self.polymorphism), str(self.het()), str(self.allelefrequencies), str(self.genotypes))
|
||||||
|
|
||||||
|
|
@ -132,7 +141,7 @@ def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ):
|
||||||
#print 'polymorphism', polymorphism
|
#print 'polymorphism', polymorphism
|
||||||
genotype = list(geliCallsAtSite[1][0][5])
|
genotype = list(geliCallsAtSite[1][0][5])
|
||||||
|
|
||||||
return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes)
|
return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes, [])
|
||||||
|
|
||||||
#return '%s %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite))
|
#return '%s %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue