diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ea4536538..f385609b4 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -8,12 +8,11 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RODIterator; +import org.broadinstitute.sting.gatk.refdata.IntervalRodIterator; import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.cmdLine.ArgumentException; import java.util.ArrayList; @@ -77,6 +76,8 @@ public class GenomeAnalysisEngine { bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", argCollection.HAPMAPFile); if (argCollection.HAPMAPChipFile != null) bindConvenienceRods("hapmap-chip", "GFF", argCollection.HAPMAPChipFile); + if ( argCollection.intervals != null ) + bindConvenienceRods("interval", "Intervals", argCollection.intervals); // parse out the rod bindings ReferenceOrderedData.parseBindings(logger, argCollection.RODBindings, rods); @@ -114,7 +115,8 @@ public class GenomeAnalysisEngine { genericEngineSetup(strictness); // parse out any genomic location they've provided - List locationsList = setupIntervalRegion(); + //List locationsList = setupIntervalRegion(); + List locationsList = engine.getLocations(); GenomeLocSortedSet locs = null; if (locationsList != null) locs = GenomeLocSortedSet.createSetFromList(locationsList); @@ -202,7 +204,7 @@ public class GenomeAnalysisEngine { // we default interval files over the genome region string if (argCollection.intervals != null) { - engine.setLocation(setupIntervalRegion()); + engine.setLocation(parseIntervalRegion(argCollection.intervals, false)); } engine.setReadFilters(new Reads(argCollection)); @@ -217,15 +219,15 @@ public class GenomeAnalysisEngine { * * @return a list of genomeLoc representing the interval file */ - private List setupIntervalRegion() { + public static List parseIntervalRegion(final String intervalsString, boolean quiet ) { List locs = null; - if (argCollection.intervals != null) { - if (new File(argCollection.intervals).exists()) { - logger.info("Intervals argument specifies a file. Loading intervals from file."); - locs = GenomeLoc.IntervalFileToList(argCollection.intervals); + if ( intervalsString != null) { + if (new File(intervalsString).exists()) { + if (! quiet) logger.info("Intervals argument specifies a file. Loading intervals from file."); + locs = GenomeLoc.IntervalFileToList(intervalsString); } else { - logger.info("Intervals argument does not specify a file. Trying to parse it as a simple string."); - locs = GenomeLoc.parseGenomeLocs(argCollection.intervals); + if (! quiet) logger.info("Intervals argument does not specify a file. Trying to parse it as a simple string."); + locs = GenomeLoc.parseGenomeLocs(intervalsString); } } return locs; diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceOrderedView.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceOrderedView.java index 4ffb96471..4ba75b5be 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceOrderedView.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceOrderedView.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.providers; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RODIterator; import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.GenomeLoc; @@ -43,7 +44,7 @@ public class ReferenceOrderedView implements View { public ReferenceOrderedView( ShardDataProvider provider ) { this.provider = provider; for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) - states.add( new ReferenceOrderedDataState( dataSource, (ReferenceOrderedData.RODIterator)dataSource.seek(provider.getShard()) ) ); + states.add( new ReferenceOrderedDataState( dataSource, (RODIterator)dataSource.seek(provider.getShard()) ) ); provider.register(this); } @@ -78,9 +79,9 @@ public class ReferenceOrderedView implements View { */ private class ReferenceOrderedDataState { public final ReferenceOrderedDataSource dataSource; - public final ReferenceOrderedData.RODIterator iterator; + public final RODIterator iterator; - public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, ReferenceOrderedData.RODIterator iterator ) { + public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, RODIterator iterator ) { this.dataSource = dataSource; this.iterator = iterator; } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPool.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPool.java index 01cf6abcc..dcfeb1382 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPool.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPool.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.RODIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; @@ -30,12 +31,12 @@ class IteratorPool { /** * All iterators of this reference-ordered data. */ - private List allIterators = new ArrayList(); + private List allIterators = new ArrayList(); /** * All iterators that are not currently in service. */ - private List availableIterators = new ArrayList(); + private List availableIterators = new ArrayList(); /** * Create a new iterator pool given the current ROD. @@ -50,10 +51,10 @@ class IteratorPool { * @param position Target position for the iterator. * @return */ - public ReferenceOrderedData.RODIterator iterator( GenomeLoc position ) { + public RODIterator iterator( GenomeLoc position ) { // Grab the first iterator in the list whose position is before the requested position. - ReferenceOrderedData.RODIterator selectedIterator = null; - for( ReferenceOrderedData.RODIterator iterator: availableIterators ) { + RODIterator selectedIterator = null; + for( RODIterator iterator: availableIterators ) { if( (iterator.position() == null && iterator.hasNext()) || (iterator.position() != null && iterator.position().isBefore(position)) ) { selectedIterator = iterator; @@ -79,7 +80,7 @@ class IteratorPool { * Close the given iterator, returning it to the pool. * @param iterator Iterator to return to the pool. */ - public void close( ReferenceOrderedData.RODIterator iterator ) { + public void close( RODIterator iterator ) { if( !allIterators.contains(iterator) ) throw new StingException("Iterator does not belong to the given pool."); availableIterators.add(iterator); diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataSource.java index ed75a7aca..8090ef4d5 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataSource.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.RODIterator; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; @@ -59,7 +60,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { * @return Iterator through the data. */ public Iterator seek( Shard shard ) { - ReferenceOrderedData.RODIterator iterator = iteratorPool.iterator(shard.getGenomeLoc()); + RODIterator iterator = iteratorPool.iterator(shard.getGenomeLoc()); return iterator; } @@ -67,7 +68,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { * Close the specified iterator, returning it to the pool. * @param iterator Iterator to close. */ - public void close( ReferenceOrderedData.RODIterator iterator ) { + public void close( RODIterator iterator ) { this.iteratorPool.close(iterator); } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRod.java b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRod.java new file mode 100755 index 000000000..f02f1f2a4 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRod.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; + +import java.util.Iterator; +import java.io.File; +import java.io.IOException; + +public class IntervalRod extends BasicReferenceOrderedDatum { + private GenomeLoc loc = null; + + public IntervalRod(String name) { + super(name); + } + + public IntervalRod(String name, GenomeLoc loc) { + super(name); + this.loc = loc; + } + + public GenomeLoc getLocation() { return loc; } + + /** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false), + * since this rod provides its own iterator for reading underlying data files. + */ + public boolean parseLine(Object header, String[] parts) { + return false; // this rod has its own iterator + } + + public String repl() { + throw new StingException("repl() is not implemented yet"); + } + + public String toSimpleString() { return loc.toString(); } + public String toString() { return toSimpleString(); } + + public static IntervalRodIterator createIterator(String trackName, File f) throws IOException { + return IntervalRodIterator.IntervalRodIteratorFromLocsFile(trackName, f); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java new file mode 100755 index 000000000..fbc33a4be --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java @@ -0,0 +1,52 @@ +package org.broadinstitute.sting.gatk.refdata; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Pair; + +public class IntervalRodIterator implements Iterator { + private List locations = null; + private Iterator iter; + private String trackName = null; + + //public static RODIterator IntervalRodIteratorFromLocs(List locs) { + // IntervalRodIterator it = new IntervalRodIterator("interval", locs); + // return new RODIterator(it); + //} + + public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) { + //System.out.printf("Parsing %s for intervals %s%n", file, trackName); + List locs = GenomeAnalysisEngine.parseIntervalRegion(file.getPath(), true); + //System.out.printf(" => got %d entries %n", locs.size()); + return new IntervalRodIterator(trackName, locs); + } + + public IntervalRodIterator(String trackName, List locs) { + this.trackName = trackName; + locations = locs; + iter = locations.iterator(); + } + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public IntervalRod next() { + IntervalRod r = new IntervalRod(trackName, iter.next()); + //System.out.printf("IntervalRod next is %s%n", r); + return r; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java index 9230d1b01..33a2f9ffc 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java @@ -22,7 +22,7 @@ import org.apache.log4j.Logger; * chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0 * */ -public class PooledEMSNPROD extends TabularROD implements AllelicVariant { +public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes { public PooledEMSNPROD(final String name) { super(name); } @@ -45,4 +45,11 @@ public class PooledEMSNPROD extends TabularROD implements AllelicVariant { public List getGenotype() throws IllegalStateException { throw new IllegalStateException(); } public int getPloidy() throws IllegalStateException { return 2; } public boolean isBiallelic() { return true; } + + // SNPCallFromGenotypes interface + public int nIndividuals() { return Integer.parseInt(this.get("EM_N")); } + public int nHomRefGenotypes() { return Integer.parseInt(this.get("n_ref")); } + public int nHetGenotypes() { return Integer.parseInt(this.get("n_het")); } + public int nHomVarGenotypes() { return Integer.parseInt(this.get("n_hom")); } + public List getGenotypes() { return null; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RODIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/RODIterator.java new file mode 100755 index 000000000..404ca6fcd --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RODIterator.java @@ -0,0 +1,95 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.gatk.iterators.PushbackIterator; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.Iterator; + +public class RODIterator implements Iterator { + private PushbackIterator it; + private ROD current = null; + private GenomeLoc position = null; + + public RODIterator(Iterator it) { + this.it = new PushbackIterator(it); + } + + public boolean hasNext() { return it.hasNext(); } + public ROD next() { + ROD next = it.next(); + if( next != null ) { + position = next.getLocation().clone(); + current = next; + } + return next; + } + + /** + * Returns the current position of this iterator. + * @return Current position of the iterator, or null if no position exists. + */ + public GenomeLoc position() { + return position; + } + + /** + * Seeks forward in the file until we reach (or cross) a record at contig / pos + * If we don't find anything and cross beyond contig / pos, we return null + * Otherwise we return the first object who's start is at pos + * + * @param loc + * @return + */ + public ROD seekForward(final GenomeLoc loc) { + final boolean DEBUG = false; + + ROD result = null; + + //if (current != null && current.getName().equals("interval")) { + // boolean contains = current.getLocation().containsP(loc); + // System.out.printf(" %s : current is %s, seeking to %s, contains %b%n", current.getName(), current.getLocation(), loc, contains); + //} + + if ( current != null && current.getLocation().containsP(loc) ) + return current; + + if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart()); + while ( hasNext() ) { + ROD proposed = next(); + if( proposed == null ) + continue; + //System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart()); + boolean containedP = proposed.getLocation().containsP(loc); + //System.out.printf(" %s -> Seeking to %s, at %s => contains = %b%n", current.getName(), loc, current.getLocation(), containedP); + int cmp = proposed.getLocation().compareTo(loc); + if ( cmp < 0 ) { + // current occurs before loc, continue searching + continue; + } + else if ( cmp == 0 || containedP ) { + result = proposed; + break; + } else { + // current is after loc + it.pushback(proposed); + break; + } + } + + if ( DEBUG ) { + if ( result != null ) + System.out.printf(" ### Found %s%n", result.getLocation()); + } + + // make a note that the iterator last seeked to the specified position + current = result; + position = loc.clone(); + + // we ran out of elements or found something + return result; + } + + public void remove() { + throw new UnsupportedOperationException(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index d84dd7975..0b3074693 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -94,7 +94,7 @@ public class RefMetaDataTracker { * @param rod */ public void bind(final String name, ReferenceOrderedDatum rod) { - logger.debug(String.format("Binding %s to %s%n", name, rod)); + //logger.debug(String.format("Binding %s to %s", name, rod)); map.put(canonicalName(name), rod); } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index edfed5e50..952e1e303 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -66,6 +66,7 @@ public class ReferenceOrderedData implements addModule("RefSeq", rodRefSeq.class); addModule("Table", TabularROD.class); addModule("PooledEM", PooledEMSNPROD.class); + addModule("Intervals", IntervalRod.class); } @@ -159,7 +160,7 @@ public class ReferenceOrderedData implements return this.name.equals(name) && type.isAssignableFrom(this.type); } - public RODIterator iterator() { + public RODIterator iterator() { Iterator it; try { Method m = type.getDeclaredMethod("createIterator", String.class,java.io.File.class); @@ -177,7 +178,7 @@ public class ReferenceOrderedData implements } catch ( java.lang.reflect.InvocationTargetException e ) { throw new RuntimeException(e); } - return new RODIterator(it); + return new RODIterator(it); } // ---------------------------------------------------------------------- @@ -301,103 +302,6 @@ public class ReferenceOrderedData implements } } -// private class SimpleRODIterator implements Iterator { -// //private WhitespaceTextFileParser parser = null; -// private TabbedTextFileParser parser = null; -// -// public SimpleRODIterator() { -// parser = new TabbedTextFileParser(true, file); -// } -// -// public boolean hasNext() { -// return parser.hasNext(); -// } -// -// public ROD next() { -// String parts[] = parser.next(); -// return parseLine(parts); -// } -// -// public void remove() { -// throw new UnsupportedOperationException(); -// } -// } - - public class RODIterator implements Iterator { - private PushbackIterator it; - private GenomeLoc position = null; - - public RODIterator(Iterator it) { - this.it = new PushbackIterator(it); - } - - public boolean hasNext() { return it.hasNext(); } - public ROD next() { - ROD next = it.next(); - if( next != null ) - position = next.getLocation().clone(); - return next; - } - - /** - * Returns the current position of this iterator. - * @return Current position of the iterator, or null if no position exists. - */ - public GenomeLoc position() { - return position; - } - - /** - * Seeks forward in the file until we reach (or cross) a record at contig / pos - * If we don't find anything and cross beyond contig / pos, we return null - * Otherwise we return the first object who's start is at pos - * - * @param loc - * @return - */ - public ROD seekForward(final GenomeLoc loc) { - final boolean DEBUG = false; - - ROD result = null; - - if ( DEBUG ) System.out.printf(" *** starting seek to %s %d%n", loc.getContig(), loc.getStart()); - while ( hasNext() ) { - ROD current = next(); - if( current == null ) - continue; - //System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart()); - int cmp = current.getLocation().compareTo(loc); - if ( cmp < 0 ) { - // current occurs before loc, continue searching - continue; - } - else if ( cmp == 0 ) { - result = current; - break; - } else { - // current is after loc - it.pushback(current); - break; - } - } - - if ( DEBUG ) { - if ( result != null ) - System.out.printf(" ### Found %s%n", result.getLocation()); - } - - // make a note that the iterator last seeked to the specified position - position = loc.clone(); - - // we ran out of elements or found something - return result; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } - // ---------------------------------------------------------------------- // // Parsing diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SNPCallFromGenotypes.java b/java/src/org/broadinstitute/sting/gatk/refdata/SNPCallFromGenotypes.java new file mode 100755 index 000000000..6b9a190c7 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SNPCallFromGenotypes.java @@ -0,0 +1,31 @@ +package org.broadinstitute.sting.gatk.refdata; + +import java.util.*; +import java.util.regex.MatchResult; +import java.util.regex.Pattern; +import java.util.regex.Matcher; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.xReadLines; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.apache.log4j.Logger; + +/** + * loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom + * chr1:1104840 A N 0.000000 -85.341265 -85.341265 0.000000 0.000000 324.000000 162 0 0 + * chr1:1104841 C N 0.000000 -69.937928 -69.937928 0.000000 0.000000 324.000000 162 0 0 + * chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0 + * + */ +public interface SNPCallFromGenotypes extends AllelicVariant { + public int nIndividuals(); + public int nHomRefGenotypes(); + public int nHetGenotypes(); + public int nHomVarGenotypes(); + public List getGenotypes(); +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 7f029eeeb..159df73ca 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -11,9 +11,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.iterators.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider; @@ -22,7 +20,6 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; import java.io.File; -import java.io.FileNotFoundException; import java.util.*; public abstract class TraversalEngine { @@ -30,7 +27,7 @@ public abstract class TraversalEngine { protected List> rods = null; // Iterator over rods - List.RODIterator>> rodIters; + List>> rodIters; // How strict should we be with SAM/BAM parsing? protected ValidationStringency strictness = ValidationStringency.STRICT; @@ -227,6 +224,10 @@ public abstract class TraversalEngine { return this.locs != null && !this.locs.isEmpty(); } + public List getLocations() { + return this.locs; + } + // -------------------------------------------------------------------------------------------------------------- // // printing @@ -446,9 +447,9 @@ public abstract class TraversalEngine { */ protected void initializeRODs() { // set up reference ordered data - rodIters = new ArrayList.RODIterator>>(); + rodIters = new ArrayList>>(); for (ReferenceOrderedData data : rods) { - rodIters.add(new Pair.RODIterator>(data.getName(), data.iterator())); + rodIters.add(new Pair>(data.getName(), data.iterator())); } } @@ -492,10 +493,11 @@ public abstract class TraversalEngine { */ protected RefMetaDataTracker getReferenceOrderedDataAtLocus(final GenomeLoc loc) { RefMetaDataTracker tracks = new RefMetaDataTracker(); - for (Pair.RODIterator> pair : rodIters) { + for (Pair> pair : rodIters) { String name = pair.getFirst(); tracks.bind(name, pair.getSecond().seekForward(loc)); } + return tracks; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java new file mode 100755 index 000000000..99e27399d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java @@ -0,0 +1,42 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; + +import java.io.PrintStream; +import java.util.List; +import java.util.ArrayList; + +public abstract class BasicVariantAnalysis implements VariantAnalysis { + protected String name; + protected PrintStream out; + protected VariantEvalWalker master; + + public BasicVariantAnalysis(String name) { + this.name = name; + } + + public String getName() { + return name; + } + + public List getParams() { + return new ArrayList(); + } + + public void initialize(VariantEvalWalker master, PrintStream out) { + this.master = master; + this.out = out; + } + + public PrintStream getPrintStream() { + return out; + } + + public List done() { + return new ArrayList(); + } + + public abstract String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context); +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/HardyWeinbergEquilibrium.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/HardyWeinbergEquilibrium.java new file mode 100755 index 000000000..8e091ff0c --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/HardyWeinbergEquilibrium.java @@ -0,0 +1,85 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.SNPCallFromGenotypes; +import org.broadinstitute.sting.gatk.refdata.PooledEMSNPROD; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; + +import cern.jet.math.Arithmetic; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Jun 4, 2009 + * Time: 4:38:00 PM + * To change this template use File | Settings | File Templates. + */ +public class HardyWeinbergEquilibrium extends BasicVariantAnalysis { + private double threshold; + int nSites = 0; + int nViolations = 0; + + HardyWeinbergEquilibrium(double threshold) { + super("hwe"); + this.threshold = threshold; + } + + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { + String r = null; + + if ( eval != null && + eval instanceof SNPCallFromGenotypes ) { + nSites++; + SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval; + + int nAA = call.nHomRefGenotypes(); + int nAa = call.nHetGenotypes(); + int naa = call.nHomVarGenotypes(); + int nA = 2 * nAA + nAa; + int n = nAA + nAa + naa; + + // + // from Emigh 1980 + // + // P = pr[nAa | nA] = multinomial[n over nAA, nAa, naa] / binomial[2n over nA] * 2^nAa + // + // where nAA, nAa, naa are the observed numbers of the three genotypes, AA, Aa, and aa, + // respectively, and nA is the number of A alleles, where nA = 2nAA + nAa, and n is the number of alleles + // + int[] mXs = { nAA, nAa, naa }; // counts of each genotype as vector + double m = MathUtils.multinomial(mXs); + double b = Arithmetic.binomial(2 * n, nA); + double tosses = Math.pow(2, nAa); + double p = (m / b) * tosses; + + if ( false ) { + System.out.printf("HWE-violation at %s %f < %f %1.2f %5d %5d %5d %5d %5d %.2e %.2e %.2e => %.6e [%s]%n", + call.getLocation(), p, threshold, call.getMAF(), nAA, nAa, naa, nA, n, m, b, tosses, p, eval); + System.out.printf("(factorial(%d) / (factorial(%d) * factorial(%d) * factorial(%d))) / choose(%d, %d) * 2^%d - %f < 1e-3%n", + nAA + nAa + naa, nAA, nAa, naa, 2 * n, nA, nAa, p); + } + + if ( p < threshold ) { + r = String.format("HWE-violation %f < %f at %s", p, threshold, eval); + nViolations++; + } + } + + return r; + } + + public List done() { + List s = new ArrayList(); + s.add(String.format("n_calls %d", nSites)); + s.add(String.format("n_violations %d", nViolations)); + s.add(String.format("violations_rate %.2f", (100.0*nSites) / nViolations)); + return s; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PairwiseDistanceAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PairwiseDistanceAnalysis.java new file mode 100755 index 000000000..97dc37375 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PairwiseDistanceAnalysis.java @@ -0,0 +1,86 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.IntervalRod; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.io.PrintStream; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Jun 4, 2009 + * Time: 4:38:19 PM + * To change this template use File | Settings | File Templates. + */ +public class PairwiseDistanceAnalysis extends BasicVariantAnalysis { + ArrayList pairWiseDistances; + int[] pairWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000}; + + AllelicVariant lastVariant = null; + GenomeLoc lastVariantInterval = null; + + public PairwiseDistanceAnalysis() { + super("pairwise_distances"); + pairWiseDistances = new ArrayList(); + } + + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { + String r = null; + + if ( eval != null ) { + IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null); + GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation(); + + if (lastVariant != null) { + GenomeLoc eL = eval.getLocation(); + GenomeLoc lvL = lastVariant.getLocation(); + if (eL.getContigIndex() == lvL.getContigIndex()) { + long d = eL.distance(lvL); + if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) { + // we're on different intervals + //out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval); + } else { + pairWiseDistances.add(d); + r = String.format("Pairwise-distance %d %s %s%n", d, eL, lvL); + } + } + } + + lastVariant = eval; + lastVariantInterval = interval; + } + + return r; + } + + public List done() { + int[] pairCounts = new int[pairWiseBoundries.length]; + Arrays.fill(pairCounts, 0); + for ( long dist : pairWiseDistances ) { + boolean done = false; + for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) { + int maxDist = pairWiseBoundries[i]; + if ( dist <= maxDist ) { + pairCounts[i]++; + done = true; + } + } + } + + List s = new ArrayList(); + s.add(String.format("snps counted for pairwise distance: %d", pairWiseDistances.size())); + for ( int i = 0; i < pairWiseBoundries.length; i++ ) { + int maxDist = pairWiseBoundries[i]; + int count = pairCounts[i]; + s.add(String.format("snps within %8d bp: %d", maxDist, count)); + } + + return s; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java new file mode 100755 index 000000000..434637a46 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java @@ -0,0 +1,74 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.BaseUtils; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Jun 4, 2009 + * Time: 4:38:00 PM + * To change this template use File | Settings | File Templates. + */ +public class TransitionTranversionAnalysis extends BasicVariantAnalysis { + int N_TRANSITION_TRANVERSION_BINS = 100; + Histogram transitions; + Histogram transversions; + + public TransitionTranversionAnalysis() { + super("transitions_transversions"); + transitions = new Histogram(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0); + transversions = new Histogram(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0); + } + + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { + if ( eval != null && eval.isSNP() ) { + char refBase = eval.getRefSnpFWD(); + char altBase = eval.getAltSnpFWD(); + //System.out.printf("%c %c%n", refBase, altBase); + //int i = transition_transversion_bin(dbsnp.getHeterozygosity()); + //System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i); + //EnumMap bin = transition_transversion_counts[i]; + + BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase); + Histogram h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions; + double het = eval.getHeterozygosity(); + h.setBin(het, h.getBin(het) + 1); + //int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION); + //int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION); + //System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString()); + } + + return null; + } + + public List done() { + int nTransitions = 0; + int nTransversions = 0; + + List s = new ArrayList(); + for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) { + //double avHet = Math.pow(10, transitions.bin2x(i)); + double avHet = transitions.bin2x(i); + if ( avHet > 0.5 ) break; + + int sit = transitions.getBin(i); + int ver = transversions.getBin(i); + nTransitions += sit; + nTransversions += ver; + double ratio = (float)sit/ver; + //s.append(String.format("%s %s %.2f %d %d %f%n", commentLine, prefix, avHet, sit, ver, ratio)); + } + + s.add(String.format("transitions %d", nTransitions)); + s.add(String.format("transversions %d", nTransversions)); + s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions))); + return s; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java new file mode 100755 index 000000000..f6ffbbdd3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java @@ -0,0 +1,17 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; + +import java.io.PrintStream; +import java.util.List; + +public interface VariantAnalysis { + public String getName(); + public PrintStream getPrintStream(); + public List getParams(); + public void initialize(VariantEvalWalker master, PrintStream out); + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context); + public List done(); +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java new file mode 100755 index 000000000..5bffd4274 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java @@ -0,0 +1,33 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; + +import java.io.PrintStream; +import java.util.List; +import java.util.ArrayList; + +public class VariantCounter extends BasicVariantAnalysis { + int nBasesCovered = 0; + int nSNPs = 0; + + public VariantCounter() { + super("variant_counts"); + } + + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { + nBasesCovered++; + nSNPs += eval == null ? 0 : 1; + return null; + } + + public List done() { + List s = new ArrayList(); + s.add(String.format("n bases covered: %d", nBasesCovered)); + s.add(String.format("sites: %d", nSNPs)); + s.add(String.format("variant rate: %.5f confident variants per base", nSNPs / (1.0 * nBasesCovered))); + s.add(String.format("variant rate: 1 / %d confident variants per base", nBasesCovered / nSNPs)); + return s; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java index 22eae9e42..38bd7348e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java @@ -1,12 +1,22 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval; -public class VariantDBCoverage { +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; + +import java.io.PrintStream; +import java.util.List; +import java.util.Arrays; +import java.util.ArrayList; + +public class VariantDBCoverage extends BasicVariantAnalysis { private String dbName; private int nDBObs = 0; private int nEvalObs = 0; private int nOverlapping = 0; public VariantDBCoverage(final String name) { + super("db_coverage"); dbName = name; } @@ -37,6 +47,13 @@ public class VariantDBCoverage { return nOverlappingSites() / (1.0 * nEvalSites()); } + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { + // There are four cases here: + AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null); + inc(dbsnp != null, eval != null); + return dbsnp == null && eval != null ? "Novel " + eval : null; + } + /** * What fraction of the DB sites were discovered in the evalution calls? * @@ -46,18 +63,15 @@ public class VariantDBCoverage { return nOverlappingSites() / (1.0 * nDBSites()); } - public String toSingleLineString(final String prefix) { - return String.format("%s %d\t%d\t%d\t%.2f\t%.2f", prefix, nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()); - } - - public String toMultiLineString(final String prefix) { - StringBuilder s = new StringBuilder(); - s.append(String.format("%s name %s%n", prefix, dbName)); - s.append(String.format("%s n_db_sites %d%n", prefix, nDBSites())); - s.append(String.format("%s n_eval_sites %d%n", prefix, nEvalSites())); - s.append(String.format("%s n_overlapping_sites %d%n", prefix, nOverlappingSites())); - s.append(String.format("%s per_eval_sites_in_db %.2f%n", prefix, 100*fractionEvalSitesCoveredByDB())); - s.append(String.format("%s per_db_sites_in_eval %.2f%n", prefix, 100*fractionDBSitesDiscoveredInEval())); - return s.toString(); + public List done() { + List s = new ArrayList(); + s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval())); + s.add(String.format("name %s", dbName)); + s.add(String.format("n_db_sites %d", nDBSites())); + s.add(String.format("n_eval_sites %d", nEvalSites())); + s.add(String.format("n_overlapping_sites %d", nOverlappingSites())); + s.add(String.format("per_eval_sites_in_db %.2f", 100*fractionEvalSitesCoveredByDB())); + s.add(String.format("per_db_sites_in_eval %.2f", 100*fractionDBSitesDiscoveredInEval())); + return s; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java index 14112ea40..2817d77b6 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java @@ -1,24 +1,16 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.cmdLine.Argument; -import java.util.EnumMap; -import java.util.ArrayList; -import java.util.Arrays; +import java.util.*; +import java.io.*; -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 2:52:28 PM - * To change this template use File | Settings | File Templates. - */ @By(DataSource.REFERENCE) @Requires(DataSource.REFERENCE) @Allows(DataSource.REFERENCE) @@ -29,110 +21,114 @@ public class VariantEvalWalker extends RefWalker { @Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false) public boolean printVariants = false; - int nBasesCovered = 0; - int nSNPs = 0; - VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP"); + @Argument(shortName="badHWEThreshold", doc="XXX", required=false) + public double badHWEThreshold = 0.001; - int N_TRANSITION_TRANVERSION_BINS = 100; - Histogram transitions; - Histogram transversions; + String analysisFilenameBase = null; - @Argument(shortName="outputFilenameBase", doc="", required=false) - String outputFilenameBase = null; + String COMMENT_STRING = ""; - ArrayList pairWiseDistances; - int[] pairWiseBoundries = {1, 10, 100, 1000, 10000, 1000000000}; - AllelicVariant lastVariant = null; + HashMap> analysisSets; - //EnumMap transition_transversion_counts[]; + final String ALL_SNPS = "all"; + final String SINGLETON_SNPS = "singletons"; + final String TWOHIT_SNPS = "2plus_hit"; + final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS }; public void initialize() { - //transitions = new Histogram(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0); - //transversions = new Histogram(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0); - transitions = new Histogram(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0); - transversions = new Histogram(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0); - pairWiseDistances = new ArrayList(); + // setup the path to the analysis + if ( this.getToolkit().getArguments().outFileName != null ) { + analysisFilenameBase = this.getToolkit().getArguments().outFileName + ".analysis."; + } + + analysisSets = new HashMap>(); + for ( String setName : ALL_ANALYSIS_NAMES ) { + analysisSets.put(setName, initializeAnalysisSet(setName)); + } } - //private int transition_transversion_bin(double het) { - // return (int)Math.floor(het * N_TRANSITION_TRANVERSION_BINS); - //} + private ArrayList getAnalysisSet(final String name) { + return analysisSets.get(name); + } - //private double transition_transversion_bin2het(int bin) { - // return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS; - //} + private ArrayList initializeAnalysisSet(final String setName) { + ArrayList analyses = new ArrayList(); + + // + // Add new analyzes here! + // + analyses.add(new VariantCounter()); + analyses.add(new VariantDBCoverage("dbSNP")); + analyses.add(new TransitionTranversionAnalysis()); + analyses.add(new PairwiseDistanceAnalysis()); + analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold)); + + + if ( printVariants ) analyses.add(new VariantMatcher("dbSNP")); + + for ( VariantAnalysis analysis : analyses ) { + analysis.initialize(this, openAnalysisOutputStream(setName, analysis)); + } + + return analyses; + } + + /** + * Returns the filename of the analysis output file where output for an analysis with + * @param name + * @param params + * @return + */ + public String getAnalysisFilename(final String name, final List params) { + if ( analysisFilenameBase == null ) + return null; + else + return analysisFilenameBase + Utils.join(".", Utils.cons(name, params)); + } + + public PrintStream openAnalysisOutputStream(final String setName, VariantAnalysis analysis) { + final String filename = getAnalysisFilename(setName + "." + analysis.getName(), analysis.getParams()); + if ( filename == null ) + return out; + else { + File file = new File(filename); + try { + return new PrintStream(new FileOutputStream(file)); + } catch (FileNotFoundException e) { + throw new StingException("Couldn't open analysis output file " + filename, e); + } + } + } public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { - nBasesCovered++; - - AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null); + // Iterate over each analysis, and update it AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null); + if ( eval != null && eval.getVariationConfidence() < minDiscoveryQ ) + eval = null; - if ( printVariants && ( eval != null || dbsnp != null ) ) { - String matchFlag = " "; - if ( eval != null && dbsnp != null ) matchFlag = "*** "; - if ( eval == null && dbsnp != null ) matchFlag = ">>> "; - if ( eval != null && dbsnp == null ) matchFlag = "<<< "; + updateAnalysisSet(ALL_SNPS, eval, tracker, ref, context); - System.out.printf("%sDBSNP: %s%n%sEVAL:%s%n", - matchFlag, dbsnp, - matchFlag, eval); + if ( eval instanceof SNPCallFromGenotypes ) { + SNPCallFromGenotypes call = (PooledEMSNPROD)eval; + int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes(); + //System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls); + final String s = nVarGenotypes == 1 ? SINGLETON_SNPS : TWOHIT_SNPS; + updateAnalysisSet(s, eval, tracker, ref, context); } - if ( eval != null ) { - //System.out.printf("GREP ME%n"); - if ( ! eval.isSNP() || eval.getVariationConfidence() < minDiscoveryQ ) - System.out.printf("We have a problem at %s: %b %f%n", eval, eval.isSNP(), eval.getVariationConfidence()); - } - - if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) { - //System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval); - //System.out.printf("N snps %d = %s%n", ++nSNPs, eval.getLocation()); - - updateTransitionTransversion(eval, ref, context); - - if (lastVariant != null) updatePairwiseDistances(eval, lastVariant); - lastVariant = eval; - //updateHapMapRate(dbsnp, eval, ref, context); - } - updateVariantDBCoverage(dbsnp, eval, ref, context); - return 1; } - private void updatePairwiseDistances(AllelicVariant eval, AllelicVariant lastVariant) { - GenomeLoc eL = eval.getLocation(); - GenomeLoc lvL = lastVariant.getLocation(); - if (eL.getContigIndex() == lvL.getContigIndex()) { - long d = eL.distance(lvL); - pairWiseDistances.add(d); - out.printf("# Pairwise-distance %d %s %s%n", d, eL, lvL); + + public void updateAnalysisSet(final String analysisSetName, AllelicVariant eval, + RefMetaDataTracker tracker, char ref, LocusContext context) { + // Iterate over each analysis, and update it + for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) { + String s = analysis.update(eval, tracker, ref, context); + if ( s != null ) analysis.getPrintStream().println(s); } } - private void updateTransitionTransversion(AllelicVariant dbsnp, char ref, LocusContext context) { - char refBase = dbsnp.getRefSnpFWD(); - char altBase = dbsnp.getAltSnpFWD(); - //System.out.printf("%c %c%n", refBase, altBase); - //int i = transition_transversion_bin(dbsnp.getHeterozygosity()); - //System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i); - //EnumMap bin = transition_transversion_counts[i]; - - BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase); - Histogram h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions; - double het = dbsnp.getHeterozygosity(); - h.setBin(het, h.getBin(het) + 1); - //int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION); - //int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION); - //System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString()); - - } - - private void updateVariantDBCoverage(AllelicVariant dbsnp, AllelicVariant eval, char ref, LocusContext context) { - // There are four cases here: - dbSNPStats.inc(dbsnp != null, eval != null); - } - // Given result of map function public Integer reduceInit() { return 0; } public Integer reduce(Integer value, Integer sum) { @@ -143,54 +139,24 @@ public class VariantEvalWalker extends RefWalker { } public void onTraversalDone(Integer result) { - int nTransitions = 0; - int nTransversions = 0; - - StringBuilder s = new StringBuilder(); - for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) { - //double avHet = Math.pow(10, transitions.bin2x(i)); - double avHet = transitions.bin2x(i); - if ( avHet > 0.5 ) break; - - int sit = transitions.getBin(i); - int ver = transversions.getBin(i); - nTransitions += sit; - nTransversions += ver; - double ratio = (float)sit/ver; - s.append(String.format("%.2f %d %d %f%n", avHet, sit, ver, ratio)); + for ( String analysisSetName : ALL_ANALYSIS_NAMES ) { + printAnalysisSet(analysisSetName); } - out.printf("# n bases covered: %d%n", nBasesCovered); - out.printf("# sites: %d%n", nSNPs); - out.printf("# variant rate: %.5f confident variants per base%n", nSNPs / (1.0 * nBasesCovered)); - out.printf("# variant rate: 1 / %d confident variants per base%n", nBasesCovered / nSNPs); - out.printf("# transitions: %d%n", nTransitions); - out.printf("# transversions: %d%n", nTransversions); - out.printf("# ratio: %.2f%n", nTransitions / (1.0 * nTransversions)); + } - // dbSNP stats - out.println(dbSNPStats.toSingleLineString("#")); - out.print(dbSNPStats.toMultiLineString("#")); - - int[] pairCounts = new int[pairWiseBoundries.length]; - Arrays.fill(pairCounts, 0); - for ( long dist : pairWiseDistances ) { - boolean done = false; - for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) { - int maxDist = pairWiseBoundries[i]; - if ( dist < maxDist ) { - pairCounts[i]++; - done = true; - } + private void printAnalysisSet( final String analysisSetName ) { + Date now = new Date(); + for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) { + PrintStream stream = analysis.getPrintStream(); // getAnalysisOutputStream(analysisSetName + "." + analysis.getName(), analysis.getParams()); + stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78)); + stream.printf("%sAnalysis set %s%n", COMMENT_STRING, analysisSetName); + stream.printf("%sAnalysis name %s%n", COMMENT_STRING, analysis.getName()); + stream.printf("%sAnalysis params %s%n", COMMENT_STRING, Utils.join(" ", analysis.getParams())); + stream.printf("%sAnalysis class %s%n", COMMENT_STRING, analysis ); + stream.printf("%sAnalysis time %s%n", COMMENT_STRING, now ); + for ( String line : analysis.done()) { + stream.printf("%s %s%n", COMMENT_STRING, line); } } - - out.printf("# snps counted for pairwise distance: %d%n", pairWiseDistances.size()); - for ( int i = 0; i < pairWiseBoundries.length; i++ ) { - int maxDist = pairWiseBoundries[i]; - int count = pairCounts[i]; - out.printf("# snps within %d bp: %d%n", maxDist, count); - } - - out.print(s); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantMatcher.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantMatcher.java new file mode 100755 index 000000000..3f519459d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantMatcher.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; + +import java.io.PrintStream; +import java.util.List; +import java.util.Arrays; +import java.util.ArrayList; + +public class VariantMatcher extends BasicVariantAnalysis { + String dbName; + + public VariantMatcher(final String name) { + super("variant_matches"); + dbName = name; + } + + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { + String r = null; + AllelicVariant db = (AllelicVariant)tracker.lookup(dbName, null); + + if ( eval != null || db != null ) { + String matchFlag = " "; + if ( eval != null && db != null ) matchFlag = "*** "; + if ( eval == null && db != null ) matchFlag = ">>> "; + if ( eval != null && db == null ) matchFlag = "<<< "; + + r = String.format("%s %s: %s <=> EVAL: %s", dbName, matchFlag, db, eval); + } + return r; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 90f86c585..3a402c882 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -84,6 +84,35 @@ public class MathUtils { //return (new cern.jet.random.Binomial(n, p, cern.jet.random.engine.RandomEngine.makeDefault())).pdf(k); } + /** + * Computes a multinomial. This is computed using the formula + * + * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] + * + * where xi represents the number of times outcome i was observed, n is the number of total observations. + * In this implementation, the value of n is inferred as the sum over i of xi. + * + * @param x an int[] of counts, where each element represents the number of times a certain outcome was observed + * @return the multinomial of the specified configuration. + */ + public static double multinomial(int[] x) { + // In order to avoid overflow in computing large factorials in the multinomial + // coefficient, we split the calculation up into the product of a bunch of + // binomial coefficients. + + double multinomialCoefficient = 1.0; + + for (int i = 0; i < x.length; i++) { + int n = 0; + for (int j = 0; j <= i; j++) { n += x[j]; } + + double multinomialTerm = Arithmetic.binomial(n, x[i]); + multinomialCoefficient *= multinomialTerm; + } + + return multinomialCoefficient; + } + /** * Computes a multinomial probability. This is computed using the formula * @@ -101,16 +130,7 @@ public class MathUtils { // In order to avoid overflow in computing large factorials in the multinomial // coefficient, we split the calculation up into the product of a bunch of // binomial coefficients. - - double multinomialCoefficient = 1.0; - - for (int i = 0; i < x.length; i++) { - int n = 0; - for (int j = 0; j <= i; j++) { n += x[j]; } - - double multinomialTerm = Arithmetic.binomial(n, x[i]); - multinomialCoefficient *= multinomialTerm; - } + double multinomialCoefficient = multinomial(x); double probs = 1.0, totalprob = 0.0; for (int obsCountsIndex = 0; obsCountsIndex < x.length; obsCountsIndex++) { diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 933a681c0..ed958131c 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -43,6 +43,13 @@ public class Utils { throw new RuntimeException(msg); } + public static List cons(final T elt, final List l) { + List l2 = new ArrayList(); + l2.add(elt); + if ( l != null ) l2.addAll(l); + return l2; + } + /** * pretty print the warning message supplied * @param message the message diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPoolTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPoolTest.java index 4ed02d88b..8ff0cd345 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPoolTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPoolTest.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.TabularROD; +import org.broadinstitute.sting.gatk.refdata.RODIterator; import java.io.File; import java.io.FileNotFoundException; @@ -56,7 +57,7 @@ public class IteratorPoolTest extends BaseTest { @Test public void testCreateSingleIterator() { IteratorPool iteratorPool = new IteratorPool(rod); - ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 ); + RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); @@ -77,10 +78,10 @@ public class IteratorPoolTest extends BaseTest { @Test public void testCreateMultipleIterators() { IteratorPool iteratorPool = new IteratorPool(rod); - ReferenceOrderedData.RODIterator iterator1 = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 ); + RODIterator iterator1 = (RODIterator)iteratorPool.iterator( testSite1 ); // Create a new iterator at position 2. - ReferenceOrderedData.RODIterator iterator2 = iteratorPool.iterator( testSite2 ); + RODIterator iterator2 = iteratorPool.iterator( testSite2 ); Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); @@ -127,7 +128,7 @@ public class IteratorPoolTest extends BaseTest { @Test public void testIteratorConservation() { IteratorPool iteratorPool = new IteratorPool(rod); - ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite1 ); + RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); @@ -162,7 +163,7 @@ public class IteratorPoolTest extends BaseTest { @Test public void testIteratorCreation() { IteratorPool iteratorPool = new IteratorPool(rod); - ReferenceOrderedData.RODIterator iterator = (ReferenceOrderedData.RODIterator)iteratorPool.iterator( testSite3 ); + RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite3 ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java index 03211226d..e4df9b23f 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java @@ -26,7 +26,7 @@ import java.util.ArrayList; public class TabularRODTest extends BaseTest { private static FastaSequenceFile2 seq; private ReferenceOrderedData ROD; - private ReferenceOrderedData.RODIterator iter; + private RODIterator iter; @BeforeClass @@ -113,7 +113,7 @@ public class TabularRODTest extends BaseTest { public void testDelim1() { File file2 = new File(testDir + "TabularDataTest2.dat"); ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class); - ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator(); + RODIterator iter_commas = ROD_commas.iterator(); logger.warn("Executing testDelim1"); TabularROD one2 = (TabularROD)iter_commas.next(); @@ -130,7 +130,7 @@ public class TabularRODTest extends BaseTest { TabularROD.setDelimiter(",",","); File file2 = new File(testDir + "TabularDataTest2.dat"); ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class); - ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator(); + RODIterator iter_commas = ROD_commas.iterator(); logger.warn("Executing testDelim1"); TabularROD one2 = (TabularROD)iter_commas.next(); @@ -173,7 +173,7 @@ public class TabularRODTest extends BaseTest { out.println(row.toString()); ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class); - ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator(); + RODIterator iter_commas = ROD_commas.iterator(); TabularROD one = (TabularROD)iter_commas.next(); assertTrue(one.size() == 3); diff --git a/python/Gelis2PopSNPs.py b/python/Gelis2PopSNPs.py index a83002e72..ab3234430 100755 --- a/python/Gelis2PopSNPs.py +++ b/python/Gelis2PopSNPs.py @@ -63,6 +63,7 @@ def main(): if filter(lambda x: x <> 0, jobids) <> []: # there's still work to do sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids)) + # TODO: Should add a wait here for all farm jobs to finish... print 'gelis', gelis print 'jobids', jobids else: @@ -76,8 +77,11 @@ def main(): dbsnpFile = geli2dbsnpFile(geli) if not os.path.exists(dbsnpFile): dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod) + if jobid == 0: jobid = None farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid) + # TODO: Should add a wait here for all farm jobs to finish... + # read in the dbSNP tracks nTotalSnps = 0 nNovelSnps = 0 @@ -99,11 +103,12 @@ def main(): jobid = None variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis) for geli, variantOut in zip(gelis, variantsOut): + name = os.path.split(geli)[1] if not os.path.exists(variantOut): - cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) ) + cmd = ("GeliToText.jar I=%s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | awk '$7 > %f {print \"%s\" $0}' > %s" % ( geli, OPTIONS.lod, name, variantOut) ) jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) - cmd = ("cat %s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) ) + cmd = ("cat %s | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) ) jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid) sortedCallFile = 'all.sorted.calls' @@ -113,14 +118,14 @@ def main(): sortedCalls = [line.split() for line in open(sortedCallFile)] aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls) - print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom' + print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom individuals' for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ): if snp == None: continue # ignore bad calls #print snp #sharedCalls = list(sharedCallsGroup) #genotype = list(sharedCalls[0][5]) - print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes()) + print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d NA' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes()) outputFile.close() diff --git a/python/farm_commands.py b/python/farm_commands.py index 37167b3ef..acdcf7c3f 100644 --- a/python/farm_commands.py +++ b/python/farm_commands.py @@ -24,7 +24,7 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman if waitID <> None: cmd_str += " -w \"done(%s)\"" % (str(waitID)) - cmd_str += " "+cmd_str_from_user + cmd_str += " \""+cmd_str_from_user + "\"" print ">>> Farming via "+cmd_str diff --git a/python/picard_utils.py b/python/picard_utils.py index 2e5070612..cbdaa406d 100755 --- a/python/picard_utils.py +++ b/python/picard_utils.py @@ -55,13 +55,22 @@ def genotypes2allelefrequencies(ref, genotypes, nIndividuals = -1): nAltChroms = nChroms - nRefChroms p = float(nRefChroms) / nChroms q = float(nAltChroms) / nChroms + + #print 'genotypes', genotypes + #print 'alleles', alleles + #print 'nChroms', nChroms + #print 'nCalledChroms', nCalledChroms + #print 'nMissingChroms', nMissingChroms + #print 'nRefChroms', nRefChroms + #print 'nAltChroms', nAltChroms + #print 'p, q', p, q assert p + q == 1 return [p, q, n] class PicardSNP: - def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes): + def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes, sources): self.loc = loc self.ref = ref self.polymorphism = polymorphism @@ -69,6 +78,7 @@ class PicardSNP: self.nIndividuals = allelefrequencies[2] self.allelefrequencies = allelefrequencies self.genotypes = genotypes + self.sources = sources def refGenotype(self): return self.ref + self.ref @@ -92,7 +102,7 @@ class PicardSNP: return self.allelefrequencies[1] def countGenotype(self, genotype): - r = len(filter( lambda x: x == genotype, self.genotypes )) + r = len(filter( lambda x: sorted(x) == sorted(genotype), self.genotypes )) #print 'countGenotype', genotype, self.genotypes, r return r @@ -105,7 +115,6 @@ class PicardSNP: def nHomVarGenotypes(self): return self.countGenotype(self.homVarGenotype()) - def __str__(self): return '%s %s %s %s %s %s' % ( self.loc, self.ref, str(self.polymorphism), str(self.het()), str(self.allelefrequencies), str(self.genotypes)) @@ -132,7 +141,7 @@ def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ): #print 'polymorphism', polymorphism genotype = list(geliCallsAtSite[1][0][5]) - return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes) + return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes, []) #return '%s %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite))