Major restructuring of the generalized variant analysis framework. Now trivially easy to add additional analyses. Easy partitioning of all analyses by features, such as singleton status. Now has transition/transversion bias, counting, dbSNP coverage, HWE violation, and selection of variants by presence/absence in dbs. Also restructured the ROD system to make it easier to add tracks. Also added the interval track -- if you provide an interval list, the system automatically makes this available to you as a bound rod -- you can always find out where you are in the interval at every site. Python scripts improved to handle more merging, etc., into population snps.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@918 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
400399f1b8
commit
819862e04e
|
|
@ -8,12 +8,11 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.IntervalRodIterator;
|
||||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -77,6 +76,8 @@ public class GenomeAnalysisEngine {
|
|||
bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", argCollection.HAPMAPFile);
|
||||
if (argCollection.HAPMAPChipFile != null)
|
||||
bindConvenienceRods("hapmap-chip", "GFF", argCollection.HAPMAPChipFile);
|
||||
if ( argCollection.intervals != null )
|
||||
bindConvenienceRods("interval", "Intervals", argCollection.intervals);
|
||||
|
||||
// parse out the rod bindings
|
||||
ReferenceOrderedData.parseBindings(logger, argCollection.RODBindings, rods);
|
||||
|
|
@ -114,7 +115,8 @@ public class GenomeAnalysisEngine {
|
|||
genericEngineSetup(strictness);
|
||||
|
||||
// parse out any genomic location they've provided
|
||||
List<GenomeLoc> locationsList = setupIntervalRegion();
|
||||
//List<GenomeLoc> locationsList = setupIntervalRegion();
|
||||
List<GenomeLoc> locationsList = engine.getLocations();
|
||||
GenomeLocSortedSet locs = null;
|
||||
if (locationsList != null)
|
||||
locs = GenomeLocSortedSet.createSetFromList(locationsList);
|
||||
|
|
@ -202,7 +204,7 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
// we default interval files over the genome region string
|
||||
if (argCollection.intervals != null) {
|
||||
engine.setLocation(setupIntervalRegion());
|
||||
engine.setLocation(parseIntervalRegion(argCollection.intervals, false));
|
||||
}
|
||||
|
||||
engine.setReadFilters(new Reads(argCollection));
|
||||
|
|
@ -217,15 +219,15 @@ public class GenomeAnalysisEngine {
|
|||
*
|
||||
* @return a list of genomeLoc representing the interval file
|
||||
*/
|
||||
private List<GenomeLoc> setupIntervalRegion() {
|
||||
public static List<GenomeLoc> parseIntervalRegion(final String intervalsString, boolean quiet ) {
|
||||
List<GenomeLoc> locs = null;
|
||||
if (argCollection.intervals != null) {
|
||||
if (new File(argCollection.intervals).exists()) {
|
||||
logger.info("Intervals argument specifies a file. Loading intervals from file.");
|
||||
locs = GenomeLoc.IntervalFileToList(argCollection.intervals);
|
||||
if ( intervalsString != null) {
|
||||
if (new File(intervalsString).exists()) {
|
||||
if (! quiet) logger.info("Intervals argument specifies a file. Loading intervals from file.");
|
||||
locs = GenomeLoc.IntervalFileToList(intervalsString);
|
||||
} else {
|
||||
logger.info("Intervals argument does not specify a file. Trying to parse it as a simple string.");
|
||||
locs = GenomeLoc.parseGenomeLocs(argCollection.intervals);
|
||||
if (! quiet) logger.info("Intervals argument does not specify a file. Trying to parse it as a simple string.");
|
||||
locs = GenomeLoc.parseGenomeLocs(intervalsString);
|
||||
}
|
||||
}
|
||||
return locs;
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.providers;
|
|||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
|
|
@ -43,7 +44,7 @@ public class ReferenceOrderedView implements View {
|
|||
public ReferenceOrderedView( ShardDataProvider provider ) {
|
||||
this.provider = provider;
|
||||
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() )
|
||||
states.add( new ReferenceOrderedDataState( dataSource, (ReferenceOrderedData.RODIterator)dataSource.seek(provider.getShard()) ) );
|
||||
states.add( new ReferenceOrderedDataState( dataSource, (RODIterator)dataSource.seek(provider.getShard()) ) );
|
||||
|
||||
provider.register(this);
|
||||
}
|
||||
|
|
@ -78,9 +79,9 @@ public class ReferenceOrderedView implements View {
|
|||
*/
|
||||
private class ReferenceOrderedDataState {
|
||||
public final ReferenceOrderedDataSource dataSource;
|
||||
public final ReferenceOrderedData.RODIterator iterator;
|
||||
public final RODIterator iterator;
|
||||
|
||||
public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, ReferenceOrderedData.RODIterator iterator ) {
|
||||
public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, RODIterator iterator ) {
|
||||
this.dataSource = dataSource;
|
||||
this.iterator = iterator;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
|||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
|
|
@ -30,12 +31,12 @@ class IteratorPool {
|
|||
/**
|
||||
* All iterators of this reference-ordered data.
|
||||
*/
|
||||
private List<ReferenceOrderedData.RODIterator> allIterators = new ArrayList<ReferenceOrderedData.RODIterator>();
|
||||
private List<RODIterator> allIterators = new ArrayList<RODIterator>();
|
||||
|
||||
/**
|
||||
* All iterators that are not currently in service.
|
||||
*/
|
||||
private List<ReferenceOrderedData.RODIterator> availableIterators = new ArrayList<ReferenceOrderedData.RODIterator>();
|
||||
private List<RODIterator> availableIterators = new ArrayList<RODIterator>();
|
||||
|
||||
/**
|
||||
* Create a new iterator pool given the current ROD.
|
||||
|
|
@ -50,10 +51,10 @@ class IteratorPool {
|
|||
* @param position Target position for the iterator.
|
||||
* @return
|
||||
*/
|
||||
public ReferenceOrderedData.RODIterator iterator( GenomeLoc position ) {
|
||||
public RODIterator iterator( GenomeLoc position ) {
|
||||
// Grab the first iterator in the list whose position is before the requested position.
|
||||
ReferenceOrderedData.RODIterator selectedIterator = null;
|
||||
for( ReferenceOrderedData.RODIterator iterator: availableIterators ) {
|
||||
RODIterator selectedIterator = null;
|
||||
for( RODIterator iterator: availableIterators ) {
|
||||
if( (iterator.position() == null && iterator.hasNext()) ||
|
||||
(iterator.position() != null && iterator.position().isBefore(position)) ) {
|
||||
selectedIterator = iterator;
|
||||
|
|
@ -79,7 +80,7 @@ class IteratorPool {
|
|||
* Close the given iterator, returning it to the pool.
|
||||
* @param iterator Iterator to return to the pool.
|
||||
*/
|
||||
public void close( ReferenceOrderedData.RODIterator iterator ) {
|
||||
public void close( RODIterator iterator ) {
|
||||
if( !allIterators.contains(iterator) )
|
||||
throw new StingException("Iterator does not belong to the given pool.");
|
||||
availableIterators.add(iterator);
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
|||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
|
@ -59,7 +60,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
|
|||
* @return Iterator through the data.
|
||||
*/
|
||||
public Iterator seek( Shard shard ) {
|
||||
ReferenceOrderedData.RODIterator iterator = iteratorPool.iterator(shard.getGenomeLoc());
|
||||
RODIterator iterator = iteratorPool.iterator(shard.getGenomeLoc());
|
||||
return iterator;
|
||||
}
|
||||
|
||||
|
|
@ -67,7 +68,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
|
|||
* Close the specified iterator, returning it to the pool.
|
||||
* @param iterator Iterator to close.
|
||||
*/
|
||||
public void close( ReferenceOrderedData.RODIterator iterator ) {
|
||||
public void close( RODIterator iterator ) {
|
||||
this.iteratorPool.close(iterator);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,41 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
public class IntervalRod extends BasicReferenceOrderedDatum {
|
||||
private GenomeLoc loc = null;
|
||||
|
||||
public IntervalRod(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
public IntervalRod(String name, GenomeLoc loc) {
|
||||
super(name);
|
||||
this.loc = loc;
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() { return loc; }
|
||||
|
||||
/** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false),
|
||||
* since this rod provides its own iterator for reading underlying data files.
|
||||
*/
|
||||
public boolean parseLine(Object header, String[] parts) {
|
||||
return false; // this rod has its own iterator
|
||||
}
|
||||
|
||||
public String repl() {
|
||||
throw new StingException("repl() is not implemented yet");
|
||||
}
|
||||
|
||||
public String toSimpleString() { return loc.toString(); }
|
||||
public String toString() { return toSimpleString(); }
|
||||
|
||||
public static IntervalRodIterator createIterator(String trackName, File f) throws IOException {
|
||||
return IntervalRodIterator.IntervalRodIteratorFromLocsFile(trackName, f);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
public class IntervalRodIterator implements Iterator<IntervalRod> {
|
||||
private List<GenomeLoc> locations = null;
|
||||
private Iterator<GenomeLoc> iter;
|
||||
private String trackName = null;
|
||||
|
||||
//public static RODIterator<IntervalRod> IntervalRodIteratorFromLocs(List<GenomeLoc> locs) {
|
||||
// IntervalRodIterator it = new IntervalRodIterator("interval", locs);
|
||||
// return new RODIterator<IntervalRod>(it);
|
||||
//}
|
||||
|
||||
public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
|
||||
//System.out.printf("Parsing %s for intervals %s%n", file, trackName);
|
||||
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(file.getPath(), true);
|
||||
//System.out.printf(" => got %d entries %n", locs.size());
|
||||
return new IntervalRodIterator(trackName, locs);
|
||||
}
|
||||
|
||||
public IntervalRodIterator(String trackName, List<GenomeLoc> locs) {
|
||||
this.trackName = trackName;
|
||||
locations = locs;
|
||||
iter = locations.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return iter.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntervalRod next() {
|
||||
IntervalRod r = new IntervalRod(trackName, iter.next());
|
||||
//System.out.printf("IntervalRod next is %s%n", r);
|
||||
return r;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
|
@ -22,7 +22,7 @@ import org.apache.log4j.Logger;
|
|||
* chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
|
||||
*
|
||||
*/
|
||||
public class PooledEMSNPROD extends TabularROD implements AllelicVariant {
|
||||
public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes {
|
||||
public PooledEMSNPROD(final String name) {
|
||||
super(name);
|
||||
}
|
||||
|
|
@ -45,4 +45,11 @@ public class PooledEMSNPROD extends TabularROD implements AllelicVariant {
|
|||
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
|
||||
public int getPloidy() throws IllegalStateException { return 2; }
|
||||
public boolean isBiallelic() { return true; }
|
||||
|
||||
// SNPCallFromGenotypes interface
|
||||
public int nIndividuals() { return Integer.parseInt(this.get("EM_N")); }
|
||||
public int nHomRefGenotypes() { return Integer.parseInt(this.get("n_ref")); }
|
||||
public int nHetGenotypes() { return Integer.parseInt(this.get("n_het")); }
|
||||
public int nHomVarGenotypes() { return Integer.parseInt(this.get("n_hom")); }
|
||||
public List<Genotype> getGenotypes() { return null; }
|
||||
}
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
public class RODIterator<ROD extends ReferenceOrderedDatum> implements Iterator<ROD> {
|
||||
private PushbackIterator<ROD> it;
|
||||
private ROD current = null;
|
||||
private GenomeLoc position = null;
|
||||
|
||||
public RODIterator(Iterator<ROD> it) {
|
||||
this.it = new PushbackIterator<ROD>(it);
|
||||
}
|
||||
|
||||
public boolean hasNext() { return it.hasNext(); }
|
||||
public ROD next() {
|
||||
ROD next = it.next();
|
||||
if( next != null ) {
|
||||
position = next.getLocation().clone();
|
||||
current = next;
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current position of this iterator.
|
||||
* @return Current position of the iterator, or null if no position exists.
|
||||
*/
|
||||
public GenomeLoc position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
/**
|
||||
* Seeks forward in the file until we reach (or cross) a record at contig / pos
|
||||
* If we don't find anything and cross beyond contig / pos, we return null
|
||||
* Otherwise we return the first object who's start is at pos
|
||||
*
|
||||
* @param loc
|
||||
* @return
|
||||
*/
|
||||
public ROD seekForward(final GenomeLoc loc) {
|
||||
final boolean DEBUG = false;
|
||||
|
||||
ROD result = null;
|
||||
|
||||
//if (current != null && current.getName().equals("interval")) {
|
||||
// boolean contains = current.getLocation().containsP(loc);
|
||||
// System.out.printf(" %s : current is %s, seeking to %s, contains %b%n", current.getName(), current.getLocation(), loc, contains);
|
||||
//}
|
||||
|
||||
if ( current != null && current.getLocation().containsP(loc) )
|
||||
return current;
|
||||
|
||||
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
|
||||
while ( hasNext() ) {
|
||||
ROD proposed = next();
|
||||
if( proposed == null )
|
||||
continue;
|
||||
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
||||
boolean containedP = proposed.getLocation().containsP(loc);
|
||||
//System.out.printf(" %s -> Seeking to %s, at %s => contains = %b%n", current.getName(), loc, current.getLocation(), containedP);
|
||||
int cmp = proposed.getLocation().compareTo(loc);
|
||||
if ( cmp < 0 ) {
|
||||
// current occurs before loc, continue searching
|
||||
continue;
|
||||
}
|
||||
else if ( cmp == 0 || containedP ) {
|
||||
result = proposed;
|
||||
break;
|
||||
} else {
|
||||
// current is after loc
|
||||
it.pushback(proposed);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( DEBUG ) {
|
||||
if ( result != null )
|
||||
System.out.printf(" ### Found %s%n", result.getLocation());
|
||||
}
|
||||
|
||||
// make a note that the iterator last seeked to the specified position
|
||||
current = result;
|
||||
position = loc.clone();
|
||||
|
||||
// we ran out of elements or found something
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
|
@ -94,7 +94,7 @@ public class RefMetaDataTracker {
|
|||
* @param rod
|
||||
*/
|
||||
public void bind(final String name, ReferenceOrderedDatum rod) {
|
||||
logger.debug(String.format("Binding %s to %s%n", name, rod));
|
||||
//logger.debug(String.format("Binding %s to %s", name, rod));
|
||||
map.put(canonicalName(name), rod);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,6 +66,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
addModule("RefSeq", rodRefSeq.class);
|
||||
addModule("Table", TabularROD.class);
|
||||
addModule("PooledEM", PooledEMSNPROD.class);
|
||||
addModule("Intervals", IntervalRod.class);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -159,7 +160,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
return this.name.equals(name) && type.isAssignableFrom(this.type);
|
||||
}
|
||||
|
||||
public RODIterator iterator() {
|
||||
public RODIterator<ROD> iterator() {
|
||||
Iterator<ROD> it;
|
||||
try {
|
||||
Method m = type.getDeclaredMethod("createIterator", String.class,java.io.File.class);
|
||||
|
|
@ -177,7 +178,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
} catch ( java.lang.reflect.InvocationTargetException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return new RODIterator(it);
|
||||
return new RODIterator<ROD>(it);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
|
@ -301,103 +302,6 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
}
|
||||
}
|
||||
|
||||
// private class SimpleRODIterator implements Iterator<ROD> {
|
||||
// //private WhitespaceTextFileParser parser = null;
|
||||
// private TabbedTextFileParser parser = null;
|
||||
//
|
||||
// public SimpleRODIterator() {
|
||||
// parser = new TabbedTextFileParser(true, file);
|
||||
// }
|
||||
//
|
||||
// public boolean hasNext() {
|
||||
// return parser.hasNext();
|
||||
// }
|
||||
//
|
||||
// public ROD next() {
|
||||
// String parts[] = parser.next();
|
||||
// return parseLine(parts);
|
||||
// }
|
||||
//
|
||||
// public void remove() {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// }
|
||||
|
||||
public class RODIterator implements Iterator<ROD> {
|
||||
private PushbackIterator<ROD> it;
|
||||
private GenomeLoc position = null;
|
||||
|
||||
public RODIterator(Iterator<ROD> it) {
|
||||
this.it = new PushbackIterator<ROD>(it);
|
||||
}
|
||||
|
||||
public boolean hasNext() { return it.hasNext(); }
|
||||
public ROD next() {
|
||||
ROD next = it.next();
|
||||
if( next != null )
|
||||
position = next.getLocation().clone();
|
||||
return next;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current position of this iterator.
|
||||
* @return Current position of the iterator, or null if no position exists.
|
||||
*/
|
||||
public GenomeLoc position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
/**
|
||||
* Seeks forward in the file until we reach (or cross) a record at contig / pos
|
||||
* If we don't find anything and cross beyond contig / pos, we return null
|
||||
* Otherwise we return the first object who's start is at pos
|
||||
*
|
||||
* @param loc
|
||||
* @return
|
||||
*/
|
||||
public ROD seekForward(final GenomeLoc loc) {
|
||||
final boolean DEBUG = false;
|
||||
|
||||
ROD result = null;
|
||||
|
||||
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart());
|
||||
while ( hasNext() ) {
|
||||
ROD current = next();
|
||||
if( current == null )
|
||||
continue;
|
||||
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
||||
int cmp = current.getLocation().compareTo(loc);
|
||||
if ( cmp < 0 ) {
|
||||
// current occurs before loc, continue searching
|
||||
continue;
|
||||
}
|
||||
else if ( cmp == 0 ) {
|
||||
result = current;
|
||||
break;
|
||||
} else {
|
||||
// current is after loc
|
||||
it.pushback(current);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( DEBUG ) {
|
||||
if ( result != null )
|
||||
System.out.printf(" ### Found %s%n", result.getLocation());
|
||||
}
|
||||
|
||||
// make a note that the iterator last seeked to the specified position
|
||||
position = loc.clone();
|
||||
|
||||
// we ran out of elements or found something
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Parsing
|
||||
|
|
|
|||
|
|
@ -0,0 +1,31 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.MatchResult;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.xReadLines;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
/**
|
||||
* loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom
|
||||
* chr1:1104840 A N 0.000000 -85.341265 -85.341265 0.000000 0.000000 324.000000 162 0 0
|
||||
* chr1:1104841 C N 0.000000 -69.937928 -69.937928 0.000000 0.000000 324.000000 162 0 0
|
||||
* chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
|
||||
*
|
||||
*/
|
||||
public interface SNPCallFromGenotypes extends AllelicVariant {
|
||||
public int nIndividuals();
|
||||
public int nHomRefGenotypes();
|
||||
public int nHetGenotypes();
|
||||
public int nHomVarGenotypes();
|
||||
public List<Genotype> getGenotypes();
|
||||
}
|
||||
|
|
@ -11,9 +11,7 @@ import net.sf.samtools.SAMRecord;
|
|||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
|
||||
|
|
@ -22,7 +20,6 @@ import org.broadinstitute.sting.utils.*;
|
|||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
public abstract class TraversalEngine {
|
||||
|
|
@ -30,7 +27,7 @@ public abstract class TraversalEngine {
|
|||
protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = null;
|
||||
|
||||
// Iterator over rods
|
||||
List<Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>> rodIters;
|
||||
List<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>> rodIters;
|
||||
|
||||
// How strict should we be with SAM/BAM parsing?
|
||||
protected ValidationStringency strictness = ValidationStringency.STRICT;
|
||||
|
|
@ -227,6 +224,10 @@ public abstract class TraversalEngine {
|
|||
return this.locs != null && !this.locs.isEmpty();
|
||||
}
|
||||
|
||||
public List<GenomeLoc> getLocations() {
|
||||
return this.locs;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// printing
|
||||
|
|
@ -446,9 +447,9 @@ public abstract class TraversalEngine {
|
|||
*/
|
||||
protected void initializeRODs() {
|
||||
// set up reference ordered data
|
||||
rodIters = new ArrayList<Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>>();
|
||||
rodIters = new ArrayList<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>>();
|
||||
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
|
||||
rodIters.add(new Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>(data.getName(), data.iterator()));
|
||||
rodIters.add(new Pair<String, RODIterator<? extends ReferenceOrderedDatum>>(data.getName(), data.iterator()));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -492,10 +493,11 @@ public abstract class TraversalEngine {
|
|||
*/
|
||||
protected RefMetaDataTracker getReferenceOrderedDataAtLocus(final GenomeLoc loc) {
|
||||
RefMetaDataTracker tracks = new RefMetaDataTracker();
|
||||
for (Pair<String, ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> pair : rodIters) {
|
||||
for (Pair<String, RODIterator<? extends ReferenceOrderedDatum>> pair : rodIters) {
|
||||
String name = pair.getFirst();
|
||||
tracks.bind(name, pair.getSecond().seekForward(loc));
|
||||
}
|
||||
|
||||
return tracks;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,42 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public abstract class BasicVariantAnalysis implements VariantAnalysis {
|
||||
protected String name;
|
||||
protected PrintStream out;
|
||||
protected VariantEvalWalker master;
|
||||
|
||||
public BasicVariantAnalysis(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public List<String> getParams() {
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
|
||||
public void initialize(VariantEvalWalker master, PrintStream out) {
|
||||
this.master = master;
|
||||
this.out = out;
|
||||
}
|
||||
|
||||
public PrintStream getPrintStream() {
|
||||
return out;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
|
||||
public abstract String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);
|
||||
}
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.SNPCallFromGenotypes;
|
||||
import org.broadinstitute.sting.gatk.refdata.PooledEMSNPROD;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import cern.jet.math.Arithmetic;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:00 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class HardyWeinbergEquilibrium extends BasicVariantAnalysis {
|
||||
private double threshold;
|
||||
int nSites = 0;
|
||||
int nViolations = 0;
|
||||
|
||||
HardyWeinbergEquilibrium(double threshold) {
|
||||
super("hwe");
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
String r = null;
|
||||
|
||||
if ( eval != null &&
|
||||
eval instanceof SNPCallFromGenotypes ) {
|
||||
nSites++;
|
||||
SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
|
||||
|
||||
int nAA = call.nHomRefGenotypes();
|
||||
int nAa = call.nHetGenotypes();
|
||||
int naa = call.nHomVarGenotypes();
|
||||
int nA = 2 * nAA + nAa;
|
||||
int n = nAA + nAa + naa;
|
||||
|
||||
//
|
||||
// from Emigh 1980
|
||||
//
|
||||
// P = pr[nAa | nA] = multinomial[n over nAA, nAa, naa] / binomial[2n over nA] * 2^nAa
|
||||
//
|
||||
// where nAA, nAa, naa are the observed numbers of the three genotypes, AA, Aa, and aa,
|
||||
// respectively, and nA is the number of A alleles, where nA = 2nAA + nAa, and n is the number of alleles
|
||||
//
|
||||
int[] mXs = { nAA, nAa, naa }; // counts of each genotype as vector
|
||||
double m = MathUtils.multinomial(mXs);
|
||||
double b = Arithmetic.binomial(2 * n, nA);
|
||||
double tosses = Math.pow(2, nAa);
|
||||
double p = (m / b) * tosses;
|
||||
|
||||
if ( false ) {
|
||||
System.out.printf("HWE-violation at %s %f < %f %1.2f %5d %5d %5d %5d %5d %.2e %.2e %.2e => %.6e [%s]%n",
|
||||
call.getLocation(), p, threshold, call.getMAF(), nAA, nAa, naa, nA, n, m, b, tosses, p, eval);
|
||||
System.out.printf("(factorial(%d) / (factorial(%d) * factorial(%d) * factorial(%d))) / choose(%d, %d) * 2^%d - %f < 1e-3%n",
|
||||
nAA + nAa + naa, nAA, nAa, naa, 2 * n, nA, nAa, p);
|
||||
}
|
||||
|
||||
if ( p < threshold ) {
|
||||
r = String.format("HWE-violation %f < %f at %s", p, threshold, eval);
|
||||
nViolations++;
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("n_calls %d", nSites));
|
||||
s.add(String.format("n_violations %d", nViolations));
|
||||
s.add(String.format("violations_rate %.2f", (100.0*nSites) / nViolations));
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,86 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:19 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
||||
ArrayList<Long> pairWiseDistances;
|
||||
int[] pairWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
|
||||
|
||||
AllelicVariant lastVariant = null;
|
||||
GenomeLoc lastVariantInterval = null;
|
||||
|
||||
public PairwiseDistanceAnalysis() {
|
||||
super("pairwise_distances");
|
||||
pairWiseDistances = new ArrayList<Long>();
|
||||
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
String r = null;
|
||||
|
||||
if ( eval != null ) {
|
||||
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
|
||||
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||
|
||||
if (lastVariant != null) {
|
||||
GenomeLoc eL = eval.getLocation();
|
||||
GenomeLoc lvL = lastVariant.getLocation();
|
||||
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
||||
long d = eL.distance(lvL);
|
||||
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
|
||||
// we're on different intervals
|
||||
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
||||
} else {
|
||||
pairWiseDistances.add(d);
|
||||
r = String.format("Pairwise-distance %d %s %s%n", d, eL, lvL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lastVariant = eval;
|
||||
lastVariantInterval = interval;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
int[] pairCounts = new int[pairWiseBoundries.length];
|
||||
Arrays.fill(pairCounts, 0);
|
||||
for ( long dist : pairWiseDistances ) {
|
||||
boolean done = false;
|
||||
for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
|
||||
int maxDist = pairWiseBoundries[i];
|
||||
if ( dist <= maxDist ) {
|
||||
pairCounts[i]++;
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("snps counted for pairwise distance: %d", pairWiseDistances.size()));
|
||||
for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
|
||||
int maxDist = pairWiseBoundries[i];
|
||||
int count = pairCounts[i];
|
||||
s.add(String.format("snps within %8d bp: %d", maxDist, count));
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:00 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class TransitionTranversionAnalysis extends BasicVariantAnalysis {
|
||||
int N_TRANSITION_TRANVERSION_BINS = 100;
|
||||
Histogram<Integer> transitions;
|
||||
Histogram<Integer> transversions;
|
||||
|
||||
public TransitionTranversionAnalysis() {
|
||||
super("transitions_transversions");
|
||||
transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
||||
transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
||||
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
if ( eval != null && eval.isSNP() ) {
|
||||
char refBase = eval.getRefSnpFWD();
|
||||
char altBase = eval.getAltSnpFWD();
|
||||
//System.out.printf("%c %c%n", refBase, altBase);
|
||||
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
|
||||
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
|
||||
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
|
||||
|
||||
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
|
||||
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
|
||||
double het = eval.getHeterozygosity();
|
||||
h.setBin(het, h.getBin(het) + 1);
|
||||
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
|
||||
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
|
||||
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
int nTransitions = 0;
|
||||
int nTransversions = 0;
|
||||
|
||||
List<String> s = new ArrayList<String>();
|
||||
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
|
||||
//double avHet = Math.pow(10, transitions.bin2x(i));
|
||||
double avHet = transitions.bin2x(i);
|
||||
if ( avHet > 0.5 ) break;
|
||||
|
||||
int sit = transitions.getBin(i);
|
||||
int ver = transversions.getBin(i);
|
||||
nTransitions += sit;
|
||||
nTransversions += ver;
|
||||
double ratio = (float)sit/ver;
|
||||
//s.append(String.format("%s %s %.2f %d %d %f%n", commentLine, prefix, avHet, sit, ver, ratio));
|
||||
}
|
||||
|
||||
s.add(String.format("transitions %d", nTransitions));
|
||||
s.add(String.format("transversions %d", nTransversions));
|
||||
s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions)));
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Contract for a single per-site analysis run by VariantEvalWalker.  The
 * walker calls initialize() once, update() at every locus, and done() at the
 * end of the traversal.
 */
public interface VariantAnalysis {
    /** @return the short name of this analysis, used to build output filenames. */
    public String getName();

    /** @return the stream this analysis writes its per-site and summary output to. */
    public PrintStream getPrintStream();

    /** @return the analysis parameters, joined into the output filename by the walker. */
    public List<String> getParams();

    /**
     * Called once before the traversal begins.
     *
     * @param master the walker driving this analysis
     * @param out    the stream to write output to
     */
    public void initialize(VariantEvalWalker master, PrintStream out);

    /**
     * Called at every locus with the (possibly null) eval variant.
     *
     * @return a line to print to this analysis' stream, or null for no output
     */
    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context);

    /** Called once after the traversal; @return summary lines for the report. */
    public List<String> done();
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class VariantCounter extends BasicVariantAnalysis {
|
||||
int nBasesCovered = 0;
|
||||
int nSNPs = 0;
|
||||
|
||||
public VariantCounter() {
|
||||
super("variant_counts");
|
||||
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
nBasesCovered++;
|
||||
nSNPs += eval == null ? 0 : 1;
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("n bases covered: %d", nBasesCovered));
|
||||
s.add(String.format("sites: %d", nSNPs));
|
||||
s.add(String.format("variant rate: %.5f confident variants per base", nSNPs / (1.0 * nBasesCovered)));
|
||||
s.add(String.format("variant rate: 1 / %d confident variants per base", nBasesCovered / nSNPs));
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,12 +1,22 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
public class VariantDBCoverage {
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class VariantDBCoverage extends BasicVariantAnalysis {
|
||||
private String dbName;
|
||||
private int nDBObs = 0;
|
||||
private int nEvalObs = 0;
|
||||
private int nOverlapping = 0;
|
||||
|
||||
public VariantDBCoverage(final String name) {
|
||||
super("db_coverage");
|
||||
dbName = name;
|
||||
}
|
||||
|
||||
|
|
@ -37,6 +47,13 @@ public class VariantDBCoverage {
|
|||
return nOverlappingSites() / (1.0 * nEvalSites());
|
||||
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
// There are four cases here:
|
||||
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null);
|
||||
inc(dbsnp != null, eval != null);
|
||||
return dbsnp == null && eval != null ? "Novel " + eval : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* What fraction of the DB sites were discovered in the evalution calls?
|
||||
*
|
||||
|
|
@ -46,18 +63,15 @@ public class VariantDBCoverage {
|
|||
return nOverlappingSites() / (1.0 * nDBSites());
|
||||
}
|
||||
|
||||
public String toSingleLineString(final String prefix) {
|
||||
return String.format("%s %d\t%d\t%d\t%.2f\t%.2f", prefix, nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval());
|
||||
}
|
||||
|
||||
public String toMultiLineString(final String prefix) {
|
||||
StringBuilder s = new StringBuilder();
|
||||
s.append(String.format("%s name %s%n", prefix, dbName));
|
||||
s.append(String.format("%s n_db_sites %d%n", prefix, nDBSites()));
|
||||
s.append(String.format("%s n_eval_sites %d%n", prefix, nEvalSites()));
|
||||
s.append(String.format("%s n_overlapping_sites %d%n", prefix, nOverlappingSites()));
|
||||
s.append(String.format("%s per_eval_sites_in_db %.2f%n", prefix, 100*fractionEvalSitesCoveredByDB()));
|
||||
s.append(String.format("%s per_db_sites_in_eval %.2f%n", prefix, 100*fractionDBSitesDiscoveredInEval()));
|
||||
return s.toString();
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()));
|
||||
s.add(String.format("name %s", dbName));
|
||||
s.add(String.format("n_db_sites %d", nDBSites()));
|
||||
s.add(String.format("n_eval_sites %d", nEvalSites()));
|
||||
s.add(String.format("n_overlapping_sites %d", nOverlappingSites()));
|
||||
s.add(String.format("per_eval_sites_in_db %.2f", 100*fractionEvalSitesCoveredByDB()));
|
||||
s.add(String.format("per_db_sites_in_eval %.2f", 100*fractionDBSitesDiscoveredInEval()));
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,24 +1,16 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Feb 22, 2009
|
||||
* Time: 2:52:28 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
@By(DataSource.REFERENCE)
|
||||
@Requires(DataSource.REFERENCE)
|
||||
@Allows(DataSource.REFERENCE)
|
||||
|
|
@ -29,110 +21,114 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
@Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false)
|
||||
public boolean printVariants = false;
|
||||
|
||||
int nBasesCovered = 0;
|
||||
int nSNPs = 0;
|
||||
VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP");
|
||||
@Argument(shortName="badHWEThreshold", doc="XXX", required=false)
|
||||
public double badHWEThreshold = 0.001;
|
||||
|
||||
int N_TRANSITION_TRANVERSION_BINS = 100;
|
||||
Histogram<Integer> transitions;
|
||||
Histogram<Integer> transversions;
|
||||
String analysisFilenameBase = null;
|
||||
|
||||
@Argument(shortName="outputFilenameBase", doc="", required=false)
|
||||
String outputFilenameBase = null;
|
||||
String COMMENT_STRING = "";
|
||||
|
||||
ArrayList<Long> pairWiseDistances;
|
||||
int[] pairWiseBoundries = {1, 10, 100, 1000, 10000, 1000000000};
|
||||
AllelicVariant lastVariant = null;
|
||||
HashMap<String, ArrayList<VariantAnalysis>> analysisSets;
|
||||
|
||||
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> transition_transversion_counts[];
|
||||
final String ALL_SNPS = "all";
|
||||
final String SINGLETON_SNPS = "singletons";
|
||||
final String TWOHIT_SNPS = "2plus_hit";
|
||||
final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS };
|
||||
|
||||
public void initialize() {
|
||||
//transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
|
||||
//transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
|
||||
transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
||||
transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
||||
pairWiseDistances = new ArrayList<Long>();
|
||||
// setup the path to the analysis
|
||||
if ( this.getToolkit().getArguments().outFileName != null ) {
|
||||
analysisFilenameBase = this.getToolkit().getArguments().outFileName + ".analysis.";
|
||||
}
|
||||
|
||||
analysisSets = new HashMap<String, ArrayList<VariantAnalysis>>();
|
||||
for ( String setName : ALL_ANALYSIS_NAMES ) {
|
||||
analysisSets.put(setName, initializeAnalysisSet(setName));
|
||||
}
|
||||
}
|
||||
|
||||
//private int transition_transversion_bin(double het) {
|
||||
// return (int)Math.floor(het * N_TRANSITION_TRANVERSION_BINS);
|
||||
//}
|
||||
private ArrayList<VariantAnalysis> getAnalysisSet(final String name) {
|
||||
return analysisSets.get(name);
|
||||
}
|
||||
|
||||
//private double transition_transversion_bin2het(int bin) {
|
||||
// return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS;
|
||||
//}
|
||||
private ArrayList<VariantAnalysis> initializeAnalysisSet(final String setName) {
|
||||
ArrayList<VariantAnalysis> analyses = new ArrayList<VariantAnalysis>();
|
||||
|
||||
//
|
||||
// Add new analyzes here!
|
||||
//
|
||||
analyses.add(new VariantCounter());
|
||||
analyses.add(new VariantDBCoverage("dbSNP"));
|
||||
analyses.add(new TransitionTranversionAnalysis());
|
||||
analyses.add(new PairwiseDistanceAnalysis());
|
||||
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||
|
||||
|
||||
if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));
|
||||
|
||||
for ( VariantAnalysis analysis : analyses ) {
|
||||
analysis.initialize(this, openAnalysisOutputStream(setName, analysis));
|
||||
}
|
||||
|
||||
return analyses;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the filename of the analysis output file where output for an analysis with
|
||||
* @param name
|
||||
* @param params
|
||||
* @return
|
||||
*/
|
||||
public String getAnalysisFilename(final String name, final List<String> params) {
|
||||
if ( analysisFilenameBase == null )
|
||||
return null;
|
||||
else
|
||||
return analysisFilenameBase + Utils.join(".", Utils.cons(name, params));
|
||||
}
|
||||
|
||||
public PrintStream openAnalysisOutputStream(final String setName, VariantAnalysis analysis) {
|
||||
final String filename = getAnalysisFilename(setName + "." + analysis.getName(), analysis.getParams());
|
||||
if ( filename == null )
|
||||
return out;
|
||||
else {
|
||||
File file = new File(filename);
|
||||
try {
|
||||
return new PrintStream(new FileOutputStream(file));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Couldn't open analysis output file " + filename, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
nBasesCovered++;
|
||||
|
||||
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null);
|
||||
// Iterate over each analysis, and update it
|
||||
AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
|
||||
if ( eval != null && eval.getVariationConfidence() < minDiscoveryQ )
|
||||
eval = null;
|
||||
|
||||
if ( printVariants && ( eval != null || dbsnp != null ) ) {
|
||||
String matchFlag = " ";
|
||||
if ( eval != null && dbsnp != null ) matchFlag = "*** ";
|
||||
if ( eval == null && dbsnp != null ) matchFlag = ">>> ";
|
||||
if ( eval != null && dbsnp == null ) matchFlag = "<<< ";
|
||||
updateAnalysisSet(ALL_SNPS, eval, tracker, ref, context);
|
||||
|
||||
System.out.printf("%sDBSNP: %s%n%sEVAL:%s%n",
|
||||
matchFlag, dbsnp,
|
||||
matchFlag, eval);
|
||||
if ( eval instanceof SNPCallFromGenotypes ) {
|
||||
SNPCallFromGenotypes call = (PooledEMSNPROD)eval;
|
||||
int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
|
||||
//System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls);
|
||||
final String s = nVarGenotypes == 1 ? SINGLETON_SNPS : TWOHIT_SNPS;
|
||||
updateAnalysisSet(s, eval, tracker, ref, context);
|
||||
}
|
||||
|
||||
if ( eval != null ) {
|
||||
//System.out.printf("GREP ME%n");
|
||||
if ( ! eval.isSNP() || eval.getVariationConfidence() < minDiscoveryQ )
|
||||
System.out.printf("We have a problem at %s: %b %f%n", eval, eval.isSNP(), eval.getVariationConfidence());
|
||||
}
|
||||
|
||||
if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) {
|
||||
//System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
|
||||
//System.out.printf("N snps %d = %s%n", ++nSNPs, eval.getLocation());
|
||||
|
||||
updateTransitionTransversion(eval, ref, context);
|
||||
|
||||
if (lastVariant != null) updatePairwiseDistances(eval, lastVariant);
|
||||
lastVariant = eval;
|
||||
//updateHapMapRate(dbsnp, eval, ref, context);
|
||||
}
|
||||
updateVariantDBCoverage(dbsnp, eval, ref, context);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
private void updatePairwiseDistances(AllelicVariant eval, AllelicVariant lastVariant) {
|
||||
GenomeLoc eL = eval.getLocation();
|
||||
GenomeLoc lvL = lastVariant.getLocation();
|
||||
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
||||
long d = eL.distance(lvL);
|
||||
pairWiseDistances.add(d);
|
||||
out.printf("# Pairwise-distance %d %s %s%n", d, eL, lvL);
|
||||
|
||||
public void updateAnalysisSet(final String analysisSetName, AllelicVariant eval,
|
||||
RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
// Iterate over each analysis, and update it
|
||||
for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
|
||||
String s = analysis.update(eval, tracker, ref, context);
|
||||
if ( s != null ) analysis.getPrintStream().println(s);
|
||||
}
|
||||
}
|
||||
|
||||
private void updateTransitionTransversion(AllelicVariant dbsnp, char ref, LocusContext context) {
|
||||
char refBase = dbsnp.getRefSnpFWD();
|
||||
char altBase = dbsnp.getAltSnpFWD();
|
||||
//System.out.printf("%c %c%n", refBase, altBase);
|
||||
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
|
||||
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
|
||||
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
|
||||
|
||||
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
|
||||
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
|
||||
double het = dbsnp.getHeterozygosity();
|
||||
h.setBin(het, h.getBin(het) + 1);
|
||||
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
|
||||
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
|
||||
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
|
||||
|
||||
}
|
||||
|
||||
private void updateVariantDBCoverage(AllelicVariant dbsnp, AllelicVariant eval, char ref, LocusContext context) {
|
||||
// There are four cases here:
|
||||
dbSNPStats.inc(dbsnp != null, eval != null);
|
||||
}
|
||||
|
||||
// Given result of map function
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
|
|
@ -143,54 +139,24 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
int nTransitions = 0;
|
||||
int nTransversions = 0;
|
||||
|
||||
StringBuilder s = new StringBuilder();
|
||||
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
|
||||
//double avHet = Math.pow(10, transitions.bin2x(i));
|
||||
double avHet = transitions.bin2x(i);
|
||||
if ( avHet > 0.5 ) break;
|
||||
|
||||
int sit = transitions.getBin(i);
|
||||
int ver = transversions.getBin(i);
|
||||
nTransitions += sit;
|
||||
nTransversions += ver;
|
||||
double ratio = (float)sit/ver;
|
||||
s.append(String.format("%.2f %d %d %f%n", avHet, sit, ver, ratio));
|
||||
for ( String analysisSetName : ALL_ANALYSIS_NAMES ) {
|
||||
printAnalysisSet(analysisSetName);
|
||||
}
|
||||
out.printf("# n bases covered: %d%n", nBasesCovered);
|
||||
out.printf("# sites: %d%n", nSNPs);
|
||||
out.printf("# variant rate: %.5f confident variants per base%n", nSNPs / (1.0 * nBasesCovered));
|
||||
out.printf("# variant rate: 1 / %d confident variants per base%n", nBasesCovered / nSNPs);
|
||||
out.printf("# transitions: %d%n", nTransitions);
|
||||
out.printf("# transversions: %d%n", nTransversions);
|
||||
out.printf("# ratio: %.2f%n", nTransitions / (1.0 * nTransversions));
|
||||
}
|
||||
|
||||
// dbSNP stats
|
||||
out.println(dbSNPStats.toSingleLineString("#"));
|
||||
out.print(dbSNPStats.toMultiLineString("#"));
|
||||
|
||||
int[] pairCounts = new int[pairWiseBoundries.length];
|
||||
Arrays.fill(pairCounts, 0);
|
||||
for ( long dist : pairWiseDistances ) {
|
||||
boolean done = false;
|
||||
for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
|
||||
int maxDist = pairWiseBoundries[i];
|
||||
if ( dist < maxDist ) {
|
||||
pairCounts[i]++;
|
||||
done = true;
|
||||
}
|
||||
private void printAnalysisSet( final String analysisSetName ) {
|
||||
Date now = new Date();
|
||||
for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
|
||||
PrintStream stream = analysis.getPrintStream(); // getAnalysisOutputStream(analysisSetName + "." + analysis.getName(), analysis.getParams());
|
||||
stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78));
|
||||
stream.printf("%sAnalysis set %s%n", COMMENT_STRING, analysisSetName);
|
||||
stream.printf("%sAnalysis name %s%n", COMMENT_STRING, analysis.getName());
|
||||
stream.printf("%sAnalysis params %s%n", COMMENT_STRING, Utils.join(" ", analysis.getParams()));
|
||||
stream.printf("%sAnalysis class %s%n", COMMENT_STRING, analysis );
|
||||
stream.printf("%sAnalysis time %s%n", COMMENT_STRING, now );
|
||||
for ( String line : analysis.done()) {
|
||||
stream.printf("%s %s%n", COMMENT_STRING, line);
|
||||
}
|
||||
}
|
||||
|
||||
out.printf("# snps counted for pairwise distance: %d%n", pairWiseDistances.size());
|
||||
for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
|
||||
int maxDist = pairWiseBoundries[i];
|
||||
int count = pairCounts[i];
|
||||
out.printf("# snps within %d bp: %d%n", maxDist, count);
|
||||
}
|
||||
|
||||
out.print(s);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,34 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class VariantMatcher extends BasicVariantAnalysis {
|
||||
String dbName;
|
||||
|
||||
public VariantMatcher(final String name) {
|
||||
super("variant_matches");
|
||||
dbName = name;
|
||||
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
String r = null;
|
||||
AllelicVariant db = (AllelicVariant)tracker.lookup(dbName, null);
|
||||
|
||||
if ( eval != null || db != null ) {
|
||||
String matchFlag = " ";
|
||||
if ( eval != null && db != null ) matchFlag = "*** ";
|
||||
if ( eval == null && db != null ) matchFlag = ">>> ";
|
||||
if ( eval != null && db == null ) matchFlag = "<<< ";
|
||||
|
||||
r = String.format("%s %s: %s <=> EVAL: %s", dbName, matchFlag, db, eval);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
@ -84,6 +84,35 @@ public class MathUtils {
|
|||
//return (new cern.jet.random.Binomial(n, p, cern.jet.random.engine.RandomEngine.makeDefault())).pdf(k);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a multinomial. This is computed using the formula
|
||||
*
|
||||
* M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ]
|
||||
*
|
||||
* where xi represents the number of times outcome i was observed, n is the number of total observations.
|
||||
* In this implementation, the value of n is inferred as the sum over i of xi.
|
||||
*
|
||||
* @param x an int[] of counts, where each element represents the number of times a certain outcome was observed
|
||||
* @return the multinomial of the specified configuration.
|
||||
*/
|
||||
public static double multinomial(int[] x) {
|
||||
// In order to avoid overflow in computing large factorials in the multinomial
|
||||
// coefficient, we split the calculation up into the product of a bunch of
|
||||
// binomial coefficients.
|
||||
|
||||
double multinomialCoefficient = 1.0;
|
||||
|
||||
for (int i = 0; i < x.length; i++) {
|
||||
int n = 0;
|
||||
for (int j = 0; j <= i; j++) { n += x[j]; }
|
||||
|
||||
double multinomialTerm = Arithmetic.binomial(n, x[i]);
|
||||
multinomialCoefficient *= multinomialTerm;
|
||||
}
|
||||
|
||||
return multinomialCoefficient;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a multinomial probability. This is computed using the formula
|
||||
*
|
||||
|
|
@ -101,16 +130,7 @@ public class MathUtils {
|
|||
// In order to avoid overflow in computing large factorials in the multinomial
|
||||
// coefficient, we split the calculation up into the product of a bunch of
|
||||
// binomial coefficients.
|
||||
|
||||
double multinomialCoefficient = 1.0;
|
||||
|
||||
for (int i = 0; i < x.length; i++) {
|
||||
int n = 0;
|
||||
for (int j = 0; j <= i; j++) { n += x[j]; }
|
||||
|
||||
double multinomialTerm = Arithmetic.binomial(n, x[i]);
|
||||
multinomialCoefficient *= multinomialTerm;
|
||||
}
|
||||
double multinomialCoefficient = multinomial(x);
|
||||
|
||||
double probs = 1.0, totalprob = 0.0;
|
||||
for (int obsCountsIndex = 0; obsCountsIndex < x.length; obsCountsIndex++) {
|
||||
|
|
|
|||
|
|
@ -43,6 +43,13 @@ public class Utils {
|
|||
throw new RuntimeException(msg);
|
||||
}
|
||||
|
||||
public static <T> List<T> cons(final T elt, final List<T> l) {
|
||||
List<T> l2 = new ArrayList<T>();
|
||||
l2.add(elt);
|
||||
if ( l != null ) l2.addAll(l);
|
||||
return l2;
|
||||
}
|
||||
|
||||
/**
|
||||
* pretty print the warning message supplied
|
||||
* @param message the message
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
|
@ -56,7 +57,7 @@ public class IteratorPoolTest extends BaseTest {
|
|||
@Test
|
||||
public void testCreateSingleIterator() {
|
||||
IteratorPool iteratorPool = new IteratorPool(rod);
|
||||
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
|
||||
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 );
|
||||
|
||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||
|
|
@ -77,10 +78,10 @@ public class IteratorPoolTest extends BaseTest {
|
|||
@Test
|
||||
public void testCreateMultipleIterators() {
|
||||
IteratorPool iteratorPool = new IteratorPool(rod);
|
||||
ReferenceOrderedData.RODIterator iterator1 = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
|
||||
RODIterator iterator1 = (RODIterator)iteratorPool.iterator( testSite1 );
|
||||
|
||||
// Create a new iterator at position 2.
|
||||
ReferenceOrderedData.RODIterator iterator2 = iteratorPool.iterator( testSite2 );
|
||||
RODIterator iterator2 = iteratorPool.iterator( testSite2 );
|
||||
|
||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||
|
|
@ -127,7 +128,7 @@ public class IteratorPoolTest extends BaseTest {
|
|||
@Test
|
||||
public void testIteratorConservation() {
|
||||
IteratorPool iteratorPool = new IteratorPool(rod);
|
||||
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 );
|
||||
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 );
|
||||
|
||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||
|
|
@ -162,7 +163,7 @@ public class IteratorPoolTest extends BaseTest {
|
|||
@Test
|
||||
public void testIteratorCreation() {
|
||||
IteratorPool iteratorPool = new IteratorPool(rod);
|
||||
ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite3 );
|
||||
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite3 );
|
||||
|
||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ import java.util.ArrayList;
|
|||
public class TabularRODTest extends BaseTest {
|
||||
private static FastaSequenceFile2 seq;
|
||||
private ReferenceOrderedData ROD;
|
||||
private ReferenceOrderedData.RODIterator iter;
|
||||
private RODIterator iter;
|
||||
|
||||
|
||||
@BeforeClass
|
||||
|
|
@ -113,7 +113,7 @@ public class TabularRODTest extends BaseTest {
|
|||
public void testDelim1() {
|
||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
||||
RODIterator iter_commas = ROD_commas.iterator();
|
||||
|
||||
logger.warn("Executing testDelim1");
|
||||
TabularROD one2 = (TabularROD)iter_commas.next();
|
||||
|
|
@ -130,7 +130,7 @@ public class TabularRODTest extends BaseTest {
|
|||
TabularROD.setDelimiter(",",",");
|
||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
||||
RODIterator iter_commas = ROD_commas.iterator();
|
||||
|
||||
logger.warn("Executing testDelim1");
|
||||
TabularROD one2 = (TabularROD)iter_commas.next();
|
||||
|
|
@ -173,7 +173,7 @@ public class TabularRODTest extends BaseTest {
|
|||
out.println(row.toString());
|
||||
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class);
|
||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
||||
RODIterator iter_commas = ROD_commas.iterator();
|
||||
|
||||
TabularROD one = (TabularROD)iter_commas.next();
|
||||
assertTrue(one.size() == 3);
|
||||
|
|
|
|||
|
|
@ -63,6 +63,7 @@ def main():
|
|||
if filter(lambda x: x <> 0, jobids) <> []:
|
||||
# there's still work to do
|
||||
sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
|
||||
# TODO: Should add a wait here for all farm jobs to finish...
|
||||
print 'gelis', gelis
|
||||
print 'jobids', jobids
|
||||
else:
|
||||
|
|
@ -76,8 +77,11 @@ def main():
|
|||
dbsnpFile = geli2dbsnpFile(geli)
|
||||
if not os.path.exists(dbsnpFile):
|
||||
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
|
||||
if jobid == 0: jobid = None
|
||||
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
||||
# TODO: Should add a wait here for all farm jobs to finish...
|
||||
|
||||
# read in the dbSNP tracks
|
||||
nTotalSnps = 0
|
||||
nNovelSnps = 0
|
||||
|
|
@ -99,11 +103,12 @@ def main():
|
|||
jobid = None
|
||||
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
||||
for geli, variantOut in zip(gelis, variantsOut):
|
||||
name = os.path.split(geli)[1]
|
||||
if not os.path.exists(variantOut):
|
||||
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )
|
||||
cmd = ("GeliToText.jar I=%s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | awk '$7 > %f {print \"%s\" $0}' > %s" % ( geli, OPTIONS.lod, name, variantOut) )
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
|
||||
|
||||
cmd = ("cat %s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
|
||||
cmd = ("cat %s | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
||||
sortedCallFile = 'all.sorted.calls'
|
||||
|
|
@ -113,14 +118,14 @@ def main():
|
|||
sortedCalls = [line.split() for line in open(sortedCallFile)]
|
||||
aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
|
||||
|
||||
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'
|
||||
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom individuals'
|
||||
|
||||
for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
|
||||
if snp == None: continue # ignore bad calls
|
||||
#print snp
|
||||
#sharedCalls = list(sharedCallsGroup)
|
||||
#genotype = list(sharedCalls[0][5])
|
||||
print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
|
||||
print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d NA' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
|
||||
outputFile.close()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
|
|||
if waitID <> None:
|
||||
cmd_str += " -w \"done(%s)\"" % (str(waitID))
|
||||
|
||||
cmd_str += " "+cmd_str_from_user
|
||||
cmd_str += " \""+cmd_str_from_user + "\""
|
||||
|
||||
print ">>> Farming via "+cmd_str
|
||||
|
||||
|
|
|
|||
|
|
@ -55,13 +55,22 @@ def genotypes2allelefrequencies(ref, genotypes, nIndividuals = -1):
|
|||
nAltChroms = nChroms - nRefChroms
|
||||
p = float(nRefChroms) / nChroms
|
||||
q = float(nAltChroms) / nChroms
|
||||
|
||||
#print 'genotypes', genotypes
|
||||
#print 'alleles', alleles
|
||||
#print 'nChroms', nChroms
|
||||
#print 'nCalledChroms', nCalledChroms
|
||||
#print 'nMissingChroms', nMissingChroms
|
||||
#print 'nRefChroms', nRefChroms
|
||||
#print 'nAltChroms', nAltChroms
|
||||
#print 'p, q', p, q
|
||||
|
||||
assert p + q == 1
|
||||
|
||||
return [p, q, n]
|
||||
|
||||
class PicardSNP:
|
||||
def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes):
|
||||
def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes, sources):
|
||||
self.loc = loc
|
||||
self.ref = ref
|
||||
self.polymorphism = polymorphism
|
||||
|
|
@ -69,6 +78,7 @@ class PicardSNP:
|
|||
self.nIndividuals = allelefrequencies[2]
|
||||
self.allelefrequencies = allelefrequencies
|
||||
self.genotypes = genotypes
|
||||
self.sources = sources
|
||||
|
||||
def refGenotype(self):
|
||||
return self.ref + self.ref
|
||||
|
|
@ -92,7 +102,7 @@ class PicardSNP:
|
|||
return self.allelefrequencies[1]
|
||||
|
||||
def countGenotype(self, genotype):
|
||||
r = len(filter( lambda x: x == genotype, self.genotypes ))
|
||||
r = len(filter( lambda x: sorted(x) == sorted(genotype), self.genotypes ))
|
||||
#print 'countGenotype', genotype, self.genotypes, r
|
||||
return r
|
||||
|
||||
|
|
@ -105,7 +115,6 @@ class PicardSNP:
|
|||
def nHomVarGenotypes(self):
|
||||
return self.countGenotype(self.homVarGenotype())
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return '%s %s %s %s %s %s' % ( self.loc, self.ref, str(self.polymorphism), str(self.het()), str(self.allelefrequencies), str(self.genotypes))
|
||||
|
||||
|
|
@ -132,7 +141,7 @@ def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ):
|
|||
#print 'polymorphism', polymorphism
|
||||
genotype = list(geliCallsAtSite[1][0][5])
|
||||
|
||||
return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes)
|
||||
return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes, [])
|
||||
|
||||
#return '%s %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite))
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue