From dcb88925689273717e4684d1d40a283d53c935fd Mon Sep 17 00:00:00 2001 From: andrewk Date: Fri, 3 Jul 2009 08:07:02 +0000 Subject: [PATCH] A lot of code for coverage evaluation tools including first version of python script to evaluate the downsampled SSG calls made and the java code to make all the calls at Hapmap chip sites at various downsampling levels; ListUtils contains functions for randomly subsetting lists (with replacement) which are useful for subsetting the same elements in both the reads and the offsets lists of a LocusWalker git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1162 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/CoverageEvalWalker.java | 82 +++++++++---------- .../broadinstitute/sting/utils/ListUtils.java | 50 +++++++++++ python/CoverageEval.py | 70 ++++++++++++++++ 3 files changed, 159 insertions(+), 43 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/utils/ListUtils.java create mode 100755 python/CoverageEval.py diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java index 5b06b1b40..99d1a1a37 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java @@ -5,12 +5,14 @@ import org.broadinstitute.sting.playground.utils.IndelLikelihood; import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods; import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.rodGFF; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; import java.util.List; +import java.util.ArrayList; import 
java.io.PrintStream; import java.io.FileNotFoundException; import java.io.File; @@ -22,14 +24,7 @@ import java.io.File; * Time: 12:38:17 PM * To change this template use File | Settings | File Templates. */ -public class CoverageEvalWalker extends LocusWalker { - - // Control how the genotype hypotheses are weighed - @Argument(fullName="priors_any_locus", shortName="plocus", doc="Comma-separated prior likelihoods for any locus (homref,het,homvar)", required=false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_hapmap", shortName="phapmap", doc="Comma-separated prior likelihoods for Hapmap loci (homref,het,homvar)", required=false) public String PRIORS_HAPMAP = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_dbsnp", shortName="pdbsnp", doc="Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required=false) public String PRIORS_DBSNP = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_2nd_on", shortName="p2ndon", doc="Comma-separated prior likelihoods for the secondary bases of primary on-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required=false) public String PRIORS_2ND_ON = "0.000,0.302,0.366,0.142,0.000,0.548,0.370,0.000,0.319,0.000"; - @Argument(fullName="priors_2nd_off", shortName="p2ndoff", doc="Comma-separated prior likelihoods for the secondary bases of primary off-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required=false) public String PRIORS_2ND_OFF = "0.480,0.769,0.744,0.538,0.575,0.727,0.768,0.589,0.762,0.505"; +public class CoverageEvalWalker extends LocusWalker, String> { // Control what goes into the variants file and what format that file should have @Argument(fullName="lod_threshold", shortName="lod", doc="The lod threshold on which variants should be filtered", required=false) public Double LOD_THRESHOLD = 5.0; @@ -37,12 +32,6 @@ public class CoverageEvalWalker extends LocusWalker map(RefMetaDataTracker tracker, char ref, LocusContext context) { + rodGFF hapmap_chip = 
(rodGFF)tracker.lookup("hapmap-chip", null); + String hc_genotype; + if (hapmap_chip != null) { + hc_genotype = hapmap_chip.getFeature(); + }else{ + hc_genotype = new String(new char[] {ref, ref}); } - return pdbls; - } - - public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) { - + //if (tracker.hasROD("hapmap-chip")) { + ArrayList Gs = new ArrayList(); ReadBackedPileup pileup = new ReadBackedPileup(ref, context); String bases = pileup.getBases(); List reads = context.getReads(); List offsets = context.getOffsets(); - // Handle single-base polymorphisms. - GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, reads, offsets); + // Iterate over coverage levels + int coverage_available = reads.size(); + int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE}; + int downsampling_repeats = 10; // number of times to random re-sample each coverage_level + for (int coverage : coverage_levels) { + coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage + for (int r=0; r subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available); + List sub_reads = ListUtils.subsetListByIndices(subset_indices, reads); + List sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets); - return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample"); + // Call genotypes on subset of reads and offsets + GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets); + String geliString = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString(); + + Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString); + } + } + + return Gs; } /** @@ -106,7 +102,7 @@ public class CoverageEvalWalker extends LocusWalker reads, List offsets) { GenotypeLikelihoods G; - G = new GenotypeLikelihoods(plocus[0], plocus[1], plocus[2], p2ndon, p2ndoff); + G = new 
GenotypeLikelihoods(); for ( int i = 0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); @@ -124,11 +120,11 @@ public class CoverageEvalWalker extends LocusWalker= LOD_THRESHOLD) || (alleleFreq.location == a) ) { - String line = GELI_OUTPUT_FORMAT ? alleleFreq.asGeliString() : alleleFreq.asTabularString(); - variantsOut.println(line); + public String reduce(List alleleFreqLines, String sum) { + //GenomeLoc a = GenomeLocParser.parseGenomeLoc("chr1:42971309"); + //if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) ) { + for (String line : alleleFreqLines) { + variantsOut.println(line); } return ""; diff --git a/java/src/org/broadinstitute/sting/utils/ListUtils.java b/java/src/org/broadinstitute/sting/utils/ListUtils.java new file mode 100644 index 000000000..5074d4eb1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/ListUtils.java @@ -0,0 +1,50 @@ +package org.broadinstitute.sting.utils; + +import java.util.List; +import java.util.HashSet; +import java.util.ArrayList; +import java.util.Random; + +/** + * Created by IntelliJ IDEA. + * User: andrew + * Date: Jul 2, 2009 + * Time: 3:12:57 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class ListUtils { + + static Random rand = new Random(12321); //System.currentTimeMillis()); + + static public ArrayList randomSubsetIndices(int n, int k) { + // Returns n random indices drawn with replacement from the range 1..k + + /*ArrayList balls = new ArrayList(); + for (int i=0; i chosen_balls = new ArrayList (); + for (int i=0; i ArrayList subsetListByIndices(List indices, List list) { + // Given a list of indices into a list, return those elements of the list list + + ArrayList subset = new ArrayList(); + + for (int i : indices) { + subset.add(list.get(i)); + } + + return subset; + } + + +} diff --git a/python/CoverageEval.py b/python/CoverageEval.py new file mode 100755 index 000000000..82cc9aa8f --- /dev/null +++ b/python/CoverageEval.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +import sys + +def chopped_line_generator(filename): + fin = open(filename) + for line in fin: + line = line.rstrip() + yield line + +def subset_list_by_indices(indices, list): + subset = [] + for index in indices: + subset.append(list[index]) + return subset + +def chunk_generator(line_gen, key_fields): + """Input: + line_gen: generator that produces lines with linefeeds chopped off + key_fields: field numbers in each record used to determine chunk membership +Output: + locus_chunk: list of consecutive lines that have the same key_fields +""" + + locus_chunk = [] + last_key = "" + for line in line_gen: + fields = line.split() + key = subset_list_by_indices(key_fields, fields) + if key == last_key: + locus_chunk.append(line) + else: + last_key =key + if locus_chunk != []: + yield locus_chunk + locus_chunk = [] + yield locus_chunk + +def chunk_stats(chunk): + records = 0 + correct_genotype = 0 + for record in chunk: + fields = record.split() + if fields[0] == fields[8]: + correct_genotype += 1 + records += 1 + return float(correct_genotype) / records + +if __name__ == "__main__": + if len(sys.argv) < 2: + sys.exit("Usage: CoverageEval.py geli_file") + filename = 
sys.argv[1] + + fin = open(filename) + locus_gen = chunk_generator(chopped_line_generator(filename), (3,4)) + print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)" + for locus in locus_gen: + #print "NEW LOCUS" + covs = dict() + coverage_chunk_gen = chunk_generator(locus, (1,3,4)) + for cov_chunk in coverage_chunk_gen: + #print "NEW COVERAGE" + #print "\n".join(cov_chunk) + fields = cov_chunk[0].split() + coverage = fields[1] + print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0]))) + + #covs[coverage] = cov_chunk + +