Added unit tests for the ListUtils function that randomly samples lists with replacement, updated AlleleFrequencyEstimate to provide a callType of HomRef, HetSNP, or HomSNP, updated indices in CoverageEval.py, and made many changes to CoverageWalker — the biggest being that it now calls SingleSampleGenotyper directly instead of re-implementing parts of SSG itself.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1189 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-07-08 02:05:40 +00:00
parent 4ba2194b5e
commit d3daecfc4d
4 changed files with 64 additions and 74 deletions

View File

@ -1,8 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.utils.IndelLikelihood;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
@ -31,10 +29,19 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
@Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false;
@Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE;
@Argument(fullName="min_coverage", shortName="mincov", doc="Mininum coverage to downsample to", required=false) public int min_coverage=1;
@Argument(fullName="max_coverage", shortName="maxcov", doc="Maximum coverage to downsample to", required=false) public int max_coverage=20;
@Argument(fullName="downsampling_repeats", shortName="repeat", doc="Number of times to repeat downsampling at each coverage level", required=false) public int downsampling_repeats=20;
public PrintStream variantsOut;
SingleSampleGenotyper SSG;
public void initialize() {
SSG = new SingleSampleGenotyper();
SSG.VARIANTS_FILE = VARIANTS_FILE;
SSG.initialize();
try {
variantsOut = new PrintStream(VARIANTS_FILE);
} catch (FileNotFoundException e) {
@ -43,7 +50,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
}
String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader();
variantsOut.println("#DownsampledCoverage\tAvailableCoveragt \t"+header);
variantsOut.println("DownsampledCoverage\tAvailableCoverage\tHapmapChipGenotype\tGenotypeCallType\t"+header.substring(1));
}
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
@ -51,69 +58,45 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
}
public List<String> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null);
String hc_genotype;
if (hapmap_chip != null) {
hc_genotype = hapmap_chip.getFeature();
}else{
hc_genotype = new String(new char[] {ref, ref});
}
//if (tracker.hasROD("hapmap-chip")) {
ArrayList<String> Gs = new ArrayList<String>();
ArrayList<String> GenotypeCalls = new ArrayList<String>();
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
String bases = pileup.getBases();
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
// Iterate over coverage levels
int coverage_available = reads.size();
int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE};
int downsampling_repeats = 10; // number of times to random re-sample each coverage_level
for (int coverage : coverage_levels) {
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
for (int r=0; r<downsampling_repeats; r++) {
List<Integer> subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available);
List<SAMRecord> sub_reads = ListUtils.subsetListByIndices(subset_indices, reads);
List<Integer> sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets);
// Call genotypes on subset of reads and offsets
GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets);
String geliString = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString();
Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString);
int coverage_available = reads.size();
List<Integer> coverage_levels = new ArrayList<Integer>();// = {4, 7, 10, 20, Integer.MAX_VALUE};
for (int coverage = min_coverage; coverage <= max_coverage; coverage++) {
coverage_levels.add(coverage);
}
coverage_levels.add(coverage_available); // Run on all available reads
// Iterate over coverage levels
for (int coverage : coverage_levels) {
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
for (int r=0; r<downsampling_repeats; r++) {
List<Integer> subset_indices = ListUtils.sampleIndicesWithReplacement(coverage, coverage_available);
List<SAMRecord> sub_reads = ListUtils.sliceListByIndices(subset_indices, reads);
List<Integer> sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets);
LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
}
}
}
return GenotypeCalls;
}else{
return new ArrayList<String>();
}
return Gs;
}
/**
* Calls the underlying, single locus genotype of the sample
*
* @param tracker the meta data tracker
* @param ref the reference base
* @param pileup the pileup object for the given locus
* @param reads the reads that overlap this locus
* @param offsets the offsets per read that identify the base at this locus
* @return the likelihoods per genotype
*/
/**
 * Calls the underlying, single locus genotype of the sample.
 *
 * @param tracker the meta data tracker
 * @param ref     the reference base
 * @param pileup  the pileup object for the given locus
 * @param reads   the reads that overlap this locus
 * @param offsets the offsets per read that identify the base at this locus
 * @return the likelihoods per genotype
 */
private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List<SAMRecord> reads, List<Integer> offsets) {
    GenotypeLikelihoods likelihoods = new GenotypeLikelihoods();
    // Accumulate evidence from the base observed at this locus in each read.
    int numReads = reads.size();
    for (int readIndex = 0; readIndex < numReads; readIndex++) {
        SAMRecord read = reads.get(readIndex);
        int offset = offsets.get(readIndex);
        char observedBase = read.getReadString().charAt(offset);
        byte baseQuality = read.getBaseQualities()[offset];
        likelihoods.add(ref, observedBase, baseQuality);
    }
    // Apply a flat prior ('N' alt, -1 weight) before returning, as the original did.
    likelihoods.ApplyPrior(ref, 'N', -1);
    return likelihoods;
}
public String reduceInit() {
@ -121,8 +104,6 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
}
public String reduce(List<String> alleleFreqLines, String sum) {
//GenomeLoc a = GenomeLocParser.parseGenomeLoc("chr1:42971309");
//if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) ) {
for (String line : alleleFreqLines) {
variantsOut.println(line);
}

View File

@ -248,4 +248,14 @@ public class AlleleFrequencyEstimate {
{
return this.posteriors[(int)this.qstar * this.N];
}
/**
 * Classifies this call by counting how many alleles of the called genotype
 * match the reference base: two matches is a homozygous-reference call, one
 * is a heterozygous SNP, and zero is a homozygous SNP.
 *
 * @return "HomozygousReference", "HeterozygousSNP", or "HomozygousSNP"
 */
public String callType() {
    String genotype = genotype();
    int referenceAlleles = 0;
    if (genotype.charAt(0) == ref) { referenceAlleles++; }
    if (genotype.charAt(1) == ref) { referenceAlleles++; }
    switch (referenceAlleles) {
        case 2:  return "HomozygousReference";
        case 1:  return "HeterozygousSNP";
        default: return "HomozygousSNP";
    }
}
}

View File

@ -16,13 +16,8 @@ public class ListUtils {
static Random rand = new Random(12321); //System.currentTimeMillis());
static public ArrayList<Integer> randomSubsetIndices(int n, int k) {
static public ArrayList<Integer> sampleIndicesWithReplacement(int n, int k) {
// Returns n random indices drawn with replacement from the range 1..k
/*ArrayList<Integer> balls = new ArrayList<Integer>();
for (int i=0; i<k; i++) {
balls.add(i);
} */
ArrayList<Integer> chosen_balls = new ArrayList <Integer>();
for (int i=0; i<n; i++) {
@ -34,8 +29,9 @@ public class ListUtils {
return chosen_balls;
}
static public <T> ArrayList<T> subsetListByIndices(List<Integer> indices, List<T> list) {
// Given a list of indices into a list, return those elements of the list list
static public <T> ArrayList<T> sliceListByIndices(List<Integer> indices, List<T> list) {
// Given a list of indices into a list, return those elements of the list with the possibility
// of drawing list elements multiple times
ArrayList<T> subset = new ArrayList<T>();

View File

@ -4,6 +4,7 @@ import sys
def chopped_line_generator(filename):
    """Yield each data line of `filename` with trailing whitespace stripped.

    The first line of the file is assumed to be a header and is discarded.
    The file handle is closed when the generator is exhausted or garbage
    collected (the original left it open — a resource leak).
    """
    with open(filename) as fin:
        fin.readline()  # pull off and discard the header line
        for line in fin:
            yield line.rstrip()
@ -24,16 +25,18 @@ Output:
locus_chunk = []
last_key = ""
first_line = True
for line in line_gen:
fields = line.split()
key = subset_list_by_indices(key_fields, fields)
if key == last_key:
if key == last_key or first_line:
locus_chunk.append(line)
first_line = False
else:
last_key =key
if locus_chunk != []:
yield locus_chunk
locus_chunk = []
locus_chunk = [line]
last_key = key
yield locus_chunk
def chunk_stats(chunk):
@ -41,7 +44,7 @@ def chunk_stats(chunk):
correct_genotype = 0
for record in chunk:
fields = record.split()
if fields[0] == fields[8]:
if fields[2] == fields[9]:
correct_genotype += 1
records += 1
return float(correct_genotype) / records
@ -52,18 +55,18 @@ if __name__ == "__main__":
filename = sys.argv[1]
fin = open(filename)
locus_gen = chunk_generator(chopped_line_generator(filename), (3,4))
locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
for locus in locus_gen:
#print "NEW LOCUS"
covs = dict()
coverage_chunk_gen = chunk_generator(locus, (1,3,4))
coverage_chunk_gen = chunk_generator(locus, (0,4,5))
for cov_chunk in coverage_chunk_gen:
#print "NEW COVERAGE"
#print "\n".join(cov_chunk)
fields = cov_chunk[0].split()
coverage = fields[1]
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0])))
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
#covs[coverage] = cov_chunk