diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java index 5b06b1b40..99d1a1a37 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java @@ -5,12 +5,14 @@ import org.broadinstitute.sting.playground.utils.IndelLikelihood; import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods; import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.rodGFF; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; import java.util.List; +import java.util.ArrayList; import java.io.PrintStream; import java.io.FileNotFoundException; import java.io.File; @@ -22,14 +24,7 @@ import java.io.File; * Time: 12:38:17 PM * To change this template use File | Settings | File Templates. */ -public class CoverageEvalWalker extends LocusWalker { - - // Control how the genotype hypotheses are weighed - @Argument(fullName="priors_any_locus", shortName="plocus", doc="Comma-separated prior likelihoods for any locus (homref,het,homvar)", required=false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_hapmap", shortName="phapmap", doc="Comma-separated prior likelihoods for Hapmap loci (homref,het,homvar)", required=false) public String PRIORS_HAPMAP = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_dbsnp", shortName="pdbsnp", doc="Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required=false) public String PRIORS_DBSNP = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_2nd_on", shortName="p2ndon", doc="Comma-separated prior likelihoods for the secondary bases of primary on-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required=false) public String PRIORS_2ND_ON = "0.000,0.302,0.366,0.142,0.000,0.548,0.370,0.000,0.319,0.000"; - @Argument(fullName="priors_2nd_off", shortName="p2ndoff", doc="Comma-separated prior likelihoods for the secondary bases of primary off-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required=false) public String PRIORS_2ND_OFF = "0.480,0.769,0.744,0.538,0.575,0.727,0.768,0.589,0.762,0.505"; +public class CoverageEvalWalker extends LocusWalker, String> { // Control what goes into the variants file and what format that file should have @Argument(fullName="lod_threshold", shortName="lod", doc="The lod threshold on which variants should be filtered", required=false) public Double LOD_THRESHOLD = 5.0; @@ -37,12 +32,6 @@ public class CoverageEvalWalker extends LocusWalker map(RefMetaDataTracker tracker, char ref, LocusContext context) { + rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null); + String hc_genotype; + if (hapmap_chip != null) { + hc_genotype = hapmap_chip.getFeature(); + }else{ + hc_genotype = new String(new char[] {ref, ref}); } - return pdbls; - } - - public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) { - + //if (tracker.hasROD("hapmap-chip")) { + ArrayList Gs = new ArrayList(); ReadBackedPileup pileup = new ReadBackedPileup(ref, context); String bases = pileup.getBases(); List reads = context.getReads(); List offsets = context.getOffsets(); - // Handle single-base polymorphisms. - GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, reads, offsets); + // Iterate over coverage levels + int coverage_available = reads.size(); + int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE}; + int downsampling_repeats = 10; // number of times to random re-sample each coverage_level + for (int coverage : coverage_levels) { + coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage + for (int r=0; r subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available); + List sub_reads = ListUtils.subsetListByIndices(subset_indices, reads); + List sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets); - return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample"); + // Call genotypes on subset of reads and offsets + GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets); + String geliString = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString(); + + Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString); + } + } + + return Gs; } /** @@ -106,7 +102,7 @@ public class CoverageEvalWalker extends LocusWalker reads, List offsets) { GenotypeLikelihoods G; - G = new GenotypeLikelihoods(plocus[0], plocus[1], plocus[2], p2ndon, p2ndoff); + G = new GenotypeLikelihoods(); for ( int i = 0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); @@ -124,11 +120,11 @@ public class CoverageEvalWalker extends LocusWalker= LOD_THRESHOLD) || (alleleFreq.location == a) ) { - String line = GELI_OUTPUT_FORMAT ? alleleFreq.asGeliString() : alleleFreq.asTabularString(); - variantsOut.println(line); + public String reduce(List alleleFreqLines, String sum) { + //GenomeLoc a = GenomeLocParser.parseGenomeLoc("chr1:42971309"); + //if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) ) { + for (String line : alleleFreqLines) { + variantsOut.println(line); } return ""; diff --git a/java/src/org/broadinstitute/sting/utils/ListUtils.java b/java/src/org/broadinstitute/sting/utils/ListUtils.java new file mode 100644 index 000000000..5074d4eb1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/ListUtils.java @@ -0,0 +1,50 @@ +package org.broadinstitute.sting.utils; + +import java.util.List; +import java.util.HashSet; +import java.util.ArrayList; +import java.util.Random; + +/** + * Created by IntelliJ IDEA. + * User: andrew + * Date: Jul 2, 2009 + * Time: 3:12:57 PM + * To change this template use File | Settings | File Templates. + */ +public class ListUtils { + + static Random rand = new Random(12321); //System.currentTimeMillis()); + + static public ArrayList randomSubsetIndices(int n, int k) { + // Returns n random indices drawn with replacement from the range 1..k + + /*ArrayList balls = new ArrayList(); + for (int i=0; i chosen_balls = new ArrayList (); + for (int i=0; i ArrayList subsetListByIndices(List indices, List list) { + // Given a list of indices into a list, return those elements of the list list + + ArrayList subset = new ArrayList(); + + for (int i : indices) { + subset.add(list.get(i)); + } + + return subset; + } + + +} diff --git a/python/CoverageEval.py b/python/CoverageEval.py new file mode 100755 index 000000000..82cc9aa8f --- /dev/null +++ b/python/CoverageEval.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +import sys + +def chopped_line_generator(filename): + fin = open(filename) + for line in fin: + line = line.rstrip() + yield line + +def subset_list_by_indices(indices, list): + subset = [] + for index in indices: + subset.append(list[index]) + return subset + +def chunk_generator(line_gen, key_fields): + """Input: + line_gen: generator that produces lines with linefeeds chopped off + key_fields: field numbers in each record used to determine chunk membership +Output: + locus_chunk: list of consecutive lines that have the same key_fields +""" + + locus_chunk = [] + last_key = "" + for line in line_gen: + fields = line.split() + key = subset_list_by_indices(key_fields, fields) + if key == last_key: + locus_chunk.append(line) + else: + last_key =key + if locus_chunk != []: + yield locus_chunk + locus_chunk = [] + yield locus_chunk + +def chunk_stats(chunk): + records = 0 + correct_genotype = 0 + for record in chunk: + fields = record.split() + if fields[0] == fields[8]: + correct_genotype += 1 + records += 1 + return float(correct_genotype) / records + +if __name__ == "__main__": + if len(sys.argv) < 2: + sys.exit("Usage: CoverageEval.py geli_file") + filename = sys.argv[1] + + fin = open(filename) + locus_gen = chunk_generator(chopped_line_generator(filename), (3,4)) + print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)" + for locus in locus_gen: + #print "NEW LOCUS" + covs = dict() + coverage_chunk_gen = chunk_generator(locus, (1,3,4)) + for cov_chunk in coverage_chunk_gen: + #print "NEW COVERAGE" + #print "\n".join(cov_chunk) + fields = cov_chunk[0].split() + coverage = fields[1] + print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0]))) + + #covs[coverage] = cov_chunk + +