Lot of code for coverage evaluation tools including first version of python script to evaluate the downsampled SSG callls made and the java code to make all the calls at Hapmap chip sites at various downsampling levels; ListUtils contains functions for randomnly subsetting lists (with replacement) which are useful for subsetting the same elements in both the reads and the offsets lists of a LocusWalker

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1162 348d0f76-0448-11de-a6fe-93d51630548a
2009-07-03 08:07:02 +00:00 · 2009-07-03 08:07:02 +00:00 · dcb8892568
parent d0cef5ff9d
commit dcb8892568
3 changed files with 159 additions and 43 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java
@ -5,12 +5,14 @@ import org.broadinstitute.sting.playground.utils.IndelLikelihood;
 import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
 import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.rodGFF;
 import org.broadinstitute.sting.gatk.LocusContext;
 import org.broadinstitute.sting.gatk.walkers.LocusWalker;
 import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.utils.*;

 import java.util.List;
+import java.util.ArrayList;
 import java.io.PrintStream;
 import java.io.FileNotFoundException;
 import java.io.File;
@ -22,14 +24,7 @@ import java.io.File;
 * Time: 12:38:17 PM
 * To change this template use File | Settings | File Templates.
 */
-public class CoverageEvalWalker extends LocusWalker<AlleleFrequencyEstimate, String> {
-
-    // Control how the genotype hypotheses are weighed
-    @Argument(fullName="priors_any_locus", shortName="plocus", doc="Comma-separated prior likelihoods for any locus (homref,het,homvar)", required=false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5";
-    @Argument(fullName="priors_hapmap", shortName="phapmap", doc="Comma-separated prior likelihoods for Hapmap loci (homref,het,homvar)", required=false) public String PRIORS_HAPMAP = "0.999,1e-3,1e-5";
-    @Argument(fullName="priors_dbsnp", shortName="pdbsnp", doc="Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required=false) public String PRIORS_DBSNP = "0.999,1e-3,1e-5";
-    @Argument(fullName="priors_2nd_on", shortName="p2ndon", doc="Comma-separated prior likelihoods for the secondary bases of primary on-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required=false) public String PRIORS_2ND_ON = "0.000,0.302,0.366,0.142,0.000,0.548,0.370,0.000,0.319,0.000";
-    @Argument(fullName="priors_2nd_off", shortName="p2ndoff", doc="Comma-separated prior likelihoods for the secondary bases of primary off-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required=false) public String PRIORS_2ND_OFF = "0.480,0.769,0.744,0.538,0.575,0.727,0.768,0.589,0.762,0.505";
+public class CoverageEvalWalker extends LocusWalker<List<String>, String> {

    // Control what goes into the variants file and what format that file should have
    @Argument(fullName="lod_threshold", shortName="lod", doc="The lod threshold on which variants should be filtered", required=false) public Double LOD_THRESHOLD = 5.0;
@ -37,12 +32,6 @@ public class CoverageEvalWalker extends LocusWalker<AlleleFrequencyEstimate, Str

    @Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE;

-    public double[] plocus;
-    public double[] phapmap;
-    public double[] pdbsnp;
-    public double[] p2ndon;
-    public double[] p2ndoff;
-
    public PrintStream variantsOut;

    public void initialize() {
@ -54,43 +43,50 @@ public class CoverageEvalWalker extends LocusWalker<AlleleFrequencyEstimate, Str
        }
                
        String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader();
-        variantsOut.println(header);
-
-        plocus = priorsArray(PRIORS_ANY_LOCUS);
-        phapmap = priorsArray(PRIORS_HAPMAP);
-        pdbsnp = priorsArray(PRIORS_DBSNP);
-        p2ndon = priorsArray(PRIORS_2ND_ON);
-        p2ndoff = priorsArray(PRIORS_2ND_OFF);
-
-    }    
+        variantsOut.println("#DownsampledCoverage\tAvailableCoveragt \t"+header);
+    }

    public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
        return (BaseUtils.simpleBaseToBaseIndex(ref) != -1 && context.getReads().size() != 0);
    }

-    private double[] priorsArray(String priorsString) {
-        String[] pstrs = priorsString.split(",");
-        double[] pdbls = new double[pstrs.length];
-
-        for (int i = 0; i < pstrs.length; i++) {
-            pdbls[i] = Double.valueOf(pstrs[i]);
+    public List<String> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
+        rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null);
+        String hc_genotype;
+        if (hapmap_chip != null) {
+            hc_genotype = hapmap_chip.getFeature();
+        }else{
+            hc_genotype = new String(new char[] {ref, ref});
        }

-        return pdbls;
-    }
-
-    public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) {
-    
+        //if (tracker.hasROD("hapmap-chip")) {
+        ArrayList<String> Gs = new ArrayList<String>();

        ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
        String bases = pileup.getBases();
        List<SAMRecord> reads = context.getReads();
        List<Integer> offsets = context.getOffsets();

-        // Handle single-base polymorphisms.
-        GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, reads, offsets);
+        // Iterate over coverage levels
+        int coverage_available = reads.size();
+        int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE};
+        int downsampling_repeats = 10; // number of times to random re-sample each coverage_level
+        for (int coverage : coverage_levels) {
+            coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
+            for (int r=0; r<downsampling_repeats; r++) {
+                List<Integer> subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available);
+                List<SAMRecord> sub_reads = ListUtils.subsetListByIndices(subset_indices, reads);
+                List<Integer> sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets);

-        return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample");
+                // Call genotypes on subset of reads and offsets
+                GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets);
+                String geliString = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString();
+
+                Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString);
+            }
+        }
+
+        return Gs;
    }

    /**
@ -106,7 +102,7 @@ public class CoverageEvalWalker extends LocusWalker<AlleleFrequencyEstimate, Str
    private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List<SAMRecord> reads, List<Integer> offsets) {
        GenotypeLikelihoods G;

-        G = new GenotypeLikelihoods(plocus[0], plocus[1], plocus[2], p2ndon, p2ndoff);
+        G = new GenotypeLikelihoods(); 

        for ( int i = 0; i < reads.size(); i++ ) {
            SAMRecord read = reads.get(i);
@ -124,11 +120,11 @@ public class CoverageEvalWalker extends LocusWalker<AlleleFrequencyEstimate, Str
        return "";
    }

-    public String reduce(AlleleFrequencyEstimate alleleFreq, String sum) {
-        GenomeLoc a =  GenomeLocParser.parseGenomeLoc("chr1:42971309");
-        if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD) || (alleleFreq.location == a) )   {
-            String line = GELI_OUTPUT_FORMAT ? alleleFreq.asGeliString() : alleleFreq.asTabularString();
-		    variantsOut.println(line);
+    public String reduce(List<String> alleleFreqLines, String sum) {
+        //GenomeLoc a =  GenomeLocParser.parseGenomeLoc("chr1:42971309");
+        //if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) )   {
+        for (String line : alleleFreqLines) {
+            variantsOut.println(line);
        }

        return "";
--- a/java/src/org/broadinstitute/sting/utils/ListUtils.java
+++ b/java/src/org/broadinstitute/sting/utils/ListUtils.java
@ -0,0 +1,50 @@
+package org.broadinstitute.sting.utils;
+
+import java.util.List;
+import java.util.HashSet;
+import java.util.ArrayList;
+import java.util.Random;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: andrew
+ * Date: Jul 2, 2009
+ * Time: 3:12:57 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class ListUtils {
+
+    static Random rand = new Random(12321); //System.currentTimeMillis());
+
+    static public ArrayList<Integer> randomSubsetIndices(int n, int k) {
+        // Returns n random indices drawn with replacement from the range 1..k
+        
+        /*ArrayList<Integer> balls = new ArrayList<Integer>();
+        for (int i=0; i<k; i++) {
+            balls.add(i);
+        } */
+
+        ArrayList<Integer> chosen_balls = new ArrayList <Integer>();
+        for (int i=0; i<n; i++) {
+            //Integer chosen_ball = balls[rand.nextInt(k)];
+            chosen_balls.add(rand.nextInt(k));
+            //balls.remove(chosen_ball);
+        }
+
+        return chosen_balls;
+    }
+
+    static public <T> ArrayList<T> subsetListByIndices(List<Integer> indices, List<T> list) {
+        // Given a list of indices into a list, return those elements of the list list 
+
+        ArrayList<T> subset = new ArrayList<T>();
+
+        for (int i : indices) {
+            subset.add(list.get(i));
+        }
+        
+        return subset;
+    }
+
+
+}
--- a/python/CoverageEval.py
+++ b/python/CoverageEval.py
@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+import sys
+
+def chopped_line_generator(filename):
+    fin = open(filename)
+    for line in fin:
+        line = line.rstrip()
+        yield line
+
+def subset_list_by_indices(indices, list):
+    subset = []
+    for index in indices:
+        subset.append(list[index])
+    return subset
+
+def chunk_generator(line_gen, key_fields):
+    """Input:
+  line_gen: generator that produces lines with linefeeds chopped off
+  key_fields: field numbers in each record used to determine chunk membership
+Output:
+  locus_chunk: list of consecutive lines that have the same key_fields
+"""
+  
+    locus_chunk = []
+    last_key = ""
+    for line in line_gen:
+        fields = line.split()
+        key = subset_list_by_indices(key_fields, fields)
+        if key == last_key:
+            locus_chunk.append(line)
+        else:
+            last_key =key
+            if locus_chunk != []:
+                yield locus_chunk
+                locus_chunk = []
+    yield locus_chunk
+
+def chunk_stats(chunk):
+    records = 0
+    correct_genotype = 0
+    for record in chunk:
+        fields = record.split()
+        if fields[0] == fields[8]:
+            correct_genotype += 1
+        records += 1
+    return float(correct_genotype) / records
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("Usage: CoverageEval.py geli_file")
+    filename = sys.argv[1]
+
+    fin = open(filename)
+    locus_gen = chunk_generator(chopped_line_generator(filename), (3,4))
+    print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
+    for locus in locus_gen:
+        #print "NEW LOCUS"
+        covs = dict()
+        coverage_chunk_gen = chunk_generator(locus, (1,3,4))
+        for cov_chunk in coverage_chunk_gen:
+            #print "NEW COVERAGE"
+            #print "\n".join(cov_chunk)
+            fields = cov_chunk[0].split()
+            coverage = fields[1]
+            print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0])))
+            
+            #covs[coverage] = cov_chunk
+            
+