diff --git a/java/src/org/broadinstitute/sting/gatk/LocusContext.java b/java/src/org/broadinstitute/sting/gatk/LocusContext.java index e4aa2829e..72e8dd1bb 100755 --- a/java/src/org/broadinstitute/sting/gatk/LocusContext.java +++ b/java/src/org/broadinstitute/sting/gatk/LocusContext.java @@ -34,10 +34,10 @@ public class LocusContext { * @param offsets */ public LocusContext(GenomeLoc loc, List reads, List offsets) { - assert loc != null; - assert loc.getContig() != null; - assert reads != null; - assert offsets != null; + //assert loc != null; + //assert loc.getContig() != null; + //assert reads != null; + //assert offsets != null; this.loc = loc; this.reads = reads; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/AllelicVariant.java b/java/src/org/broadinstitute/sting/gatk/refdata/AllelicVariant.java index 62e526de7..4671848d0 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/AllelicVariant.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/AllelicVariant.java @@ -95,6 +95,12 @@ public interface AllelicVariant extends Comparable { */ double getMAF() ; + /** Returns heterozygosity, a more accessible general feature of a variant. + * + * @return + */ + double getHeterozygosity() ; + /** Is this variant an actual genotype (such as individual call from sequencing, HapMap chip etc), or * population allelic variant (call from pooled sequencing, dbSNP site etc). Only if variant is a genotype, there * is a meaningful question of, e.g., whether it is a het, or homogeneous non-ref. diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java new file mode 100755 index 000000000..9230d1b01 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.gatk.refdata; + +import java.util.*; +import java.util.regex.MatchResult; +import java.util.regex.Pattern; +import java.util.regex.Matcher; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.xReadLines; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.apache.log4j.Logger; + +/** + * loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom + * chr1:1104840 A N 0.000000 -85.341265 -85.341265 0.000000 0.000000 324.000000 162 0 0 + * chr1:1104841 C N 0.000000 -69.937928 -69.937928 0.000000 0.000000 324.000000 162 0 0 + * chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0 + * + */ +public class PooledEMSNPROD extends TabularROD implements AllelicVariant { + public PooledEMSNPROD(final String name) { + super(name); + } + + //GenomeLoc getLocation(); + public String getRefBasesFWD() { return this.get("ref"); } + public char getRefSnpFWD() throws IllegalStateException { return getRefBasesFWD().charAt(0); } + public String getAltBasesFWD() { return this.get("alt"); } + public char getAltSnpFWD() throws IllegalStateException { return getAltBasesFWD().charAt(0); } + public boolean isReference() { return getVariationConfidence() < 0.01; } + public boolean isSNP() { return ! isReference(); } + public boolean isInsertion() { return false; } + public boolean isDeletion() { return false; } + public boolean isIndel() { return false; } + public double getMAF() { return Double.parseDouble(this.get("EM_alt_freq")); } + public double getHeterozygosity() { return 2 * getMAF() * (1 - getMAF()); } + public boolean isGenotype() { return false; } + public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")) * 10; } + public double getConsensusConfidence() { return -1; } + public List getGenotype() throws IllegalStateException { throw new IllegalStateException(); } + public int getPloidy() throws IllegalStateException { return 2; } + public boolean isBiallelic() { return true; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 0f581eb49..d84dd7975 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -63,6 +63,7 @@ public class RefMetaDataTracker { */ private final String canonicalName(final String name) { + //return name; // .toLowerCase(); return name.toLowerCase(); } @@ -93,7 +94,7 @@ public class RefMetaDataTracker { * @param rod */ public void bind(final String name, ReferenceOrderedDatum rod) { - //logger.debug(String.format("Binding %s to %s%n", name, rod)); + logger.debug(String.format("Binding %s to %s%n", name, rod)); map.put(canonicalName(name), rod); } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index aa15fe85d..8b1e917fa 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -64,6 +64,7 @@ public class ReferenceOrderedData implements addModule("SAMPileup", rodSAMPileup.class); addModule("RefSeq", rodRefSeq.class); addModule("Table", TabularROD.class); + addModule("PooledEM", PooledEMSNPROD.class); } @@ -80,24 +81,28 @@ public class ReferenceOrderedData implements public static void parseBindings(Logger logger, ArrayList bindings, List > rods) { // Loop over triplets - for( String binding: bindings ) { - String[] bindingTokens = binding.split(","); - logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens)); - if( bindingTokens.length != 3 ) + System.out.printf("Binding is %s%n", Utils.join(" XXX ", bindings)); + for( String bindingSets: bindings ) { + String[] bindingTokens = bindingSets.split(","); + if( bindingTokens.length % 3 != 0 ) Utils.scareUser(String.format("Invalid ROD specification: requires triplets of ,, but got %s", Utils.join(",", bindings))); - final String name = bindingTokens[0]; - final String typeName = bindingTokens[1]; - final String fileName = bindingTokens[2]; + for ( int bindingSet = 0; bindingSet < bindingTokens.length; bindingSet += 3 ) { + logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens)); - ReferenceOrderedData rod = parse1Binding(logger, name, typeName, fileName); + final String name = bindingTokens[bindingSet]; + final String typeName = bindingTokens[bindingSet + 1]; + final String fileName = bindingTokens[bindingSet + 2]; - // check that we're not generating duplicate bindings - for ( ReferenceOrderedData rod2 : rods ) - if ( rod2.getName().equals(rod.getName()) ) - Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName())); + ReferenceOrderedData rod = parse1Binding(logger, name, typeName, fileName); - rods.add(rod); + // check that we're not generating duplicate bindings + for ( ReferenceOrderedData rod2 : rods ) + if ( rod2.getName().equals(rod.getName()) ) + Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName())); + + rods.add(rod); + } } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java index cb247d795..fddc566c2 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java @@ -52,7 +52,7 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map ", parts)); @@ -196,7 +197,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria @Override public double getConsensusConfidence() { // TODO Auto-generated method stub - return 0; + return Double.MAX_VALUE; } @Override @@ -205,12 +206,16 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria return null; } - @Override public double getMAF() { // Fixme: update to actually get MAF - return avHet; + //return avHet; + return -1; } + public double getHeterozygosity() { + return avHet; + } + @Override public int getPloidy() throws IllegalStateException { // TODO Auto-generated method stub @@ -220,7 +225,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria @Override public double getVariationConfidence() { // TODO Auto-generated method stub - return 100; + return Double.MAX_VALUE; } @Override diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/Histogram.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/Histogram.java new file mode 100755 index 000000000..e78ec6a5e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/Histogram.java @@ -0,0 +1,55 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: May 27, 2009 + * Time: 2:37:56 PM + * To change this template use File | Settings | File Templates. + */ +public class Histogram { + ArrayList data; + int nBins = 100; + double minValue, maxValue; + + public Histogram(int nBins, double minValue, double maxValue, T initialValue) { + data = new ArrayList(nBins); + this.nBins = nBins; + this.minValue = minValue; + this.maxValue = maxValue; + initialize(initialValue); + } + + public void initialize(T initialValue) { + data.clear(); + for ( int i = 0; i < nBins; i++ ) { + data.add(i, initialValue); + } + } + + public void setBin(int i, T value) { + data.set(i, value); + } + + public void setBin(double x, T value) { + setBin(x2bin(x), value); + } + + public T getBin(int binI) { + return data.get(binI); + } + + public T getBin(double x) { + return getBin(x2bin(x)); + } + + public int x2bin(double x) { + return (int)Math.floor((x - minValue) / ((maxValue - minValue) * nBins)); + } + + public double bin2x(int bin) { + return (maxValue - minValue) * ((bin + 0.5) / nBins) + minValue; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java new file mode 100755 index 000000000..22eae9e42 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java @@ -0,0 +1,63 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +public class VariantDBCoverage { + private String dbName; + private int nDBObs = 0; + private int nEvalObs = 0; + private int nOverlapping = 0; + + public VariantDBCoverage(final String name) { + dbName = name; + } + + public void inc(boolean inDB, boolean inEval) { + if (inDB) nDBObs++; + if (inEval) nEvalObs++; + if (inDB && inEval) nOverlapping++; + } + + public int nDBSites() { + return nDBObs; + } + + public int nEvalSites() { + return nEvalObs; + } + + public int nOverlappingSites() { + return nOverlapping; + } + + /** + * What fraction of the evaluated site variants were also found in the db? + * + * @return + */ + public double fractionEvalSitesCoveredByDB() { + return nOverlappingSites() / (1.0 * nEvalSites()); + } + + /** + * What fraction of the DB sites were discovered in the evalution calls? + * + * @return + */ + public double fractionDBSitesDiscoveredInEval() { + return nOverlappingSites() / (1.0 * nDBSites()); + } + + public String toSingleLineString(final String prefix) { + return String.format("%s %d\t%d\t%d\t%.2f\t%.2f", prefix, nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()); + } + + public String toMultiLineString(final String prefix) { + StringBuilder s = new StringBuilder(); + s.append(String.format("%s name %s%n", prefix, dbName)); + s.append(String.format("%s n_db_sites %d%n", prefix, nDBSites())); + s.append(String.format("%s n_eval_sites %d%n", prefix, nEvalSites())); + s.append(String.format("%s n_overlapping_sites %d%n", prefix, nOverlappingSites())); + s.append(String.format("%s per_eval_sites_in_db %.2f%n", prefix, 100*fractionEvalSitesCoveredByDB())); + s.append(String.format("%s per_db_sites_in_eval %.2f%n", prefix, 100*fractionDBSitesDiscoveredInEval())); + return s.toString(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java index 65aa34e68..99a646bf2 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java @@ -2,10 +2,10 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.refdata.AllelicVariant; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.EnumMap; @@ -20,49 +20,75 @@ import java.util.EnumMap; @Requires(DataSource.REFERENCE) @Allows(DataSource.REFERENCE) public class VariantEvalWalker extends RefWalker { - int N_TRANSITION_TRANVERSION_BINS = 100; - EnumMap transition_transversion_counts[]; + @Argument(shortName="minDiscoveryQ", doc="Phred-scaled minimum LOD to consider an evaluation SNP a variant", required=false) + public int minDiscoveryQ = -1; + int nBasesCovered = 0; + VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP"); + + int N_TRANSITION_TRANVERSION_BINS = 20; + Histogram transitions; + Histogram transversions; + //EnumMap transition_transversion_counts[]; public void initialize() { - transition_transversion_counts = new EnumMap[N_TRANSITION_TRANVERSION_BINS]; - - for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) { - transition_transversion_counts[i] = new EnumMap(BaseUtils.BaseSubstitutionType.class); - for ( BaseUtils.BaseSubstitutionType t : BaseUtils.BaseSubstitutionType.values() ) { - transition_transversion_counts[i].put(t, 0); - } - } + //transitions = new Histogram(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0); + //transversions = new Histogram(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0); + transitions = new Histogram(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0); + transversions = new Histogram(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0); } - private int transition_transversion_bin(double MAF) { - return (int)Math.floor(MAF * N_TRANSITION_TRANVERSION_BINS); - } + //private int transition_transversion_bin(double het) { + // return (int)Math.floor(het * N_TRANSITION_TRANVERSION_BINS); + //} - private double transition_transversion_bin2MAF(int bin) { - return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS; - } + //private double transition_transversion_bin2het(int bin) { + // return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS; + //} public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { + nBasesCovered++; + AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null); + AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null); - if ( dbsnp != null && dbsnp.isSNP() ) { - char refBase = dbsnp.getRefSnpFWD(); - char altBase = dbsnp.getAltSnpFWD(); - //System.out.printf("%c %c%n", refBase, altBase); - int i = transition_transversion_bin(dbsnp.getMAF()); - //System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i); - EnumMap bin = transition_transversion_counts[i]; + //if ( eval != null || dbsnp != null ) + // System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval); - BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase); - bin.put(subType, bin.get(subType) + 1); - int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION); - int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION); - out.printf("%s %d %d %.2f %s%n", subType, sit, ver, (float)sit/ver, dbsnp.toString()); + if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) { + //System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval); + + updateTransitionTransversion(eval, ref, context); + + //updateHapMapRate(dbsnp, eval, ref, context); } + updateVariantDBCoverage(dbsnp, eval, ref, context); return 1; } + private void updateTransitionTransversion(AllelicVariant dbsnp, char ref, LocusContext context) { + char refBase = dbsnp.getRefSnpFWD(); + char altBase = dbsnp.getAltSnpFWD(); + //System.out.printf("%c %c%n", refBase, altBase); + //int i = transition_transversion_bin(dbsnp.getHeterozygosity()); + //System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i); + //EnumMap bin = transition_transversion_counts[i]; + + BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase); + Histogram h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions; + double het = dbsnp.getHeterozygosity(); + h.setBin(het, h.getBin(het) + 1); + //int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION); + //int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION); + //System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString()); + + } + + private void updateVariantDBCoverage(AllelicVariant dbsnp, AllelicVariant eval, char ref, LocusContext context) { + // There are four cases here: + dbSNPStats.inc(dbsnp != null, eval != null); + } + // Given result of map function public Integer reduceInit() { return 0; } public Integer reduce(Integer value, Integer sum) { @@ -73,13 +99,36 @@ public class VariantEvalWalker extends RefWalker { } public void onTraversalDone(Integer result) { + int nTransitions = 0; + int nTransversions = 0; + + StringBuilder s = new StringBuilder(); for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) { - double averageMAF = transition_transversion_bin2MAF(i); - EnumMap bin = transition_transversion_counts[i]; - int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION); - int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION); + //double avHet = Math.pow(10, transitions.bin2x(i)); + double avHet = transitions.bin2x(i); + if ( avHet > 0.5 ) break; + + int sit = transitions.getBin(i); + int ver = transversions.getBin(i); + nTransitions += sit; + nTransversions += ver; double ratio = (float)sit/ver; - out.printf("%.2f %d %d %f%n", averageMAF, sit, ver, ratio); + s.append(String.format("%.2f %d %d %f%n", avHet, sit, ver, ratio)); } + long nSites = nTransitions + nTransversions; + + out.printf("# n bases covered: %d%n", nBasesCovered); + out.printf("# sites: %d%n", nSites); + out.printf("# variant rate: %.5f confident variants per base%n", nSites / (1.0 * nBasesCovered)); + out.printf("# variant rate: 1 / %d confident variants per base%n", nBasesCovered / nSites); + out.printf("# transitions: %d%n", nTransitions); + out.printf("# transversions: %d%n", nTransversions); + out.printf("# ratio: %.2f%n", nTransitions / (1.0 * nTransversions)); + + // dbSNP stats + out.println(dbSNPStats.toSingleLineString("#")); + out.print(dbSNPStats.toMultiLineString("#")); + + out.print(s); } -} \ No newline at end of file +}