Added unit tests for the ListUtils function that randomly samples lists with replacement, updated AlleleFrequencyEstimate to provide a callType of HomRef, HetSNP, or HomSNP, updated indices in CoverageEval.py, and made many changes to CoverageEvalWalker — the biggest being that it now calls SingleSampleGenotyper directly instead of re-implementing parts of SSG itself.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1189 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-07-08 02:05:40 +00:00
parent 4ba2194b5e
commit d3daecfc4d
4 changed files with 64 additions and 74 deletions

View File

@ -1,8 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers; package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.utils.IndelLikelihood;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodGFF; import org.broadinstitute.sting.gatk.refdata.rodGFF;
@ -31,10 +29,19 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
@Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false; @Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false;
@Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE; @Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE;
@Argument(fullName="min_coverage", shortName="mincov", doc="Mininum coverage to downsample to", required=false) public int min_coverage=1;
@Argument(fullName="max_coverage", shortName="maxcov", doc="Maximum coverage to downsample to", required=false) public int max_coverage=20;
@Argument(fullName="downsampling_repeats", shortName="repeat", doc="Number of times to repeat downsampling at each coverage level", required=false) public int downsampling_repeats=20;
public PrintStream variantsOut; public PrintStream variantsOut;
SingleSampleGenotyper SSG;
public void initialize() { public void initialize() {
SSG = new SingleSampleGenotyper();
SSG.VARIANTS_FILE = VARIANTS_FILE;
SSG.initialize();
try { try {
variantsOut = new PrintStream(VARIANTS_FILE); variantsOut = new PrintStream(VARIANTS_FILE);
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
@ -43,7 +50,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
} }
String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader(); String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader();
variantsOut.println("#DownsampledCoverage\tAvailableCoveragt \t"+header); variantsOut.println("DownsampledCoverage\tAvailableCoverage\tHapmapChipGenotype\tGenotypeCallType\t"+header.substring(1));
} }
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
@ -51,69 +58,45 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
} }
public List<String> map(RefMetaDataTracker tracker, char ref, LocusContext context) { public List<String> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null); rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null);
String hc_genotype; String hc_genotype;
if (hapmap_chip != null) { if (hapmap_chip != null) {
hc_genotype = hapmap_chip.getFeature(); hc_genotype = hapmap_chip.getFeature();
}else{
hc_genotype = new String(new char[] {ref, ref});
}
//if (tracker.hasROD("hapmap-chip")) { ArrayList<String> GenotypeCalls = new ArrayList<String>();
ArrayList<String> Gs = new ArrayList<String>();
ReadBackedPileup pileup = new ReadBackedPileup(ref, context); List<SAMRecord> reads = context.getReads();
String bases = pileup.getBases(); List<Integer> offsets = context.getOffsets();
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
// Iterate over coverage levels int coverage_available = reads.size();
int coverage_available = reads.size(); List<Integer> coverage_levels = new ArrayList<Integer>();// = {4, 7, 10, 20, Integer.MAX_VALUE};
int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE}; for (int coverage = min_coverage; coverage <= max_coverage; coverage++) {
int downsampling_repeats = 10; // number of times to random re-sample each coverage_level coverage_levels.add(coverage);
for (int coverage : coverage_levels) {
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
for (int r=0; r<downsampling_repeats; r++) {
List<Integer> subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available);
List<SAMRecord> sub_reads = ListUtils.subsetListByIndices(subset_indices, reads);
List<Integer> sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets);
// Call genotypes on subset of reads and offsets
GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets);
String geliString = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString();
Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString);
} }
coverage_levels.add(coverage_available); // Run on all available reads
// Iterate over coverage levels
for (int coverage : coverage_levels) {
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
for (int r=0; r<downsampling_repeats; r++) {
List<Integer> subset_indices = ListUtils.sampleIndicesWithReplacement(coverage, coverage_available);
List<SAMRecord> sub_reads = ListUtils.sliceListByIndices(subset_indices, reads);
List<Integer> sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets);
LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
}
}
}
return GenotypeCalls;
}else{
return new ArrayList<String>();
} }
return Gs;
}
/**
* Calls the underlying, single locus genotype of the sample
*
* @param tracker the meta data tracker
* @param ref the reference base
* @param pileup the pileup object for the given locus
* @param reads the reads that overlap this locus
* @param offsets the offsets per read that identify the base at this locus
* @return the likelihoods per genotype
*/
private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List<SAMRecord> reads, List<Integer> offsets) {
GenotypeLikelihoods G;
G = new GenotypeLikelihoods();
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
}
G.ApplyPrior(ref, 'N', -1);
return G;
} }
public String reduceInit() { public String reduceInit() {
@ -121,8 +104,6 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
} }
public String reduce(List<String> alleleFreqLines, String sum) { public String reduce(List<String> alleleFreqLines, String sum) {
//GenomeLoc a = GenomeLocParser.parseGenomeLoc("chr1:42971309");
//if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) ) {
for (String line : alleleFreqLines) { for (String line : alleleFreqLines) {
variantsOut.println(line); variantsOut.println(line);
} }

View File

@ -248,4 +248,14 @@ public class AlleleFrequencyEstimate {
{ {
return this.posteriors[(int)this.qstar * this.N]; return this.posteriors[(int)this.qstar * this.N];
} }
public String callType() {
// Returns a string indicating whether the call is homozygous reference, heterozygous SNP, or homozygous SNP
String[] callTypeString = {"HomozygousSNP", "HeterozygousSNP", "HomozygousReference"};
String genotype = genotype();
int ref_matches = (genotype.charAt(0) == ref ? 1 : 0) + (genotype.charAt(1) == ref ? 1 : 0);
return callTypeString[ref_matches];
}
} }

View File

@ -16,14 +16,9 @@ public class ListUtils {
static Random rand = new Random(12321); //System.currentTimeMillis()); static Random rand = new Random(12321); //System.currentTimeMillis());
static public ArrayList<Integer> randomSubsetIndices(int n, int k) { static public ArrayList<Integer> sampleIndicesWithReplacement(int n, int k) {
// Returns n random indices drawn with replacement from the range 1..k // Returns n random indices drawn with replacement from the range 1..k
/*ArrayList<Integer> balls = new ArrayList<Integer>();
for (int i=0; i<k; i++) {
balls.add(i);
} */
ArrayList<Integer> chosen_balls = new ArrayList <Integer>(); ArrayList<Integer> chosen_balls = new ArrayList <Integer>();
for (int i=0; i<n; i++) { for (int i=0; i<n; i++) {
//Integer chosen_ball = balls[rand.nextInt(k)]; //Integer chosen_ball = balls[rand.nextInt(k)];
@ -34,8 +29,9 @@ public class ListUtils {
return chosen_balls; return chosen_balls;
} }
static public <T> ArrayList<T> subsetListByIndices(List<Integer> indices, List<T> list) { static public <T> ArrayList<T> sliceListByIndices(List<Integer> indices, List<T> list) {
// Given a list of indices into a list, return those elements of the list list // Given a list of indices into a list, return those elements of the list with the possibility
// of drawing list elements multiple times
ArrayList<T> subset = new ArrayList<T>(); ArrayList<T> subset = new ArrayList<T>();

View File

@ -4,6 +4,7 @@ import sys
def chopped_line_generator(filename): def chopped_line_generator(filename):
fin = open(filename) fin = open(filename)
fin.readline() # pull off header
for line in fin: for line in fin:
line = line.rstrip() line = line.rstrip()
yield line yield line
@ -24,16 +25,18 @@ Output:
locus_chunk = [] locus_chunk = []
last_key = "" last_key = ""
first_line = True
for line in line_gen: for line in line_gen:
fields = line.split() fields = line.split()
key = subset_list_by_indices(key_fields, fields) key = subset_list_by_indices(key_fields, fields)
if key == last_key: if key == last_key or first_line:
locus_chunk.append(line) locus_chunk.append(line)
first_line = False
else: else:
last_key =key
if locus_chunk != []: if locus_chunk != []:
yield locus_chunk yield locus_chunk
locus_chunk = [] locus_chunk = [line]
last_key = key
yield locus_chunk yield locus_chunk
def chunk_stats(chunk): def chunk_stats(chunk):
@ -41,7 +44,7 @@ def chunk_stats(chunk):
correct_genotype = 0 correct_genotype = 0
for record in chunk: for record in chunk:
fields = record.split() fields = record.split()
if fields[0] == fields[8]: if fields[2] == fields[9]:
correct_genotype += 1 correct_genotype += 1
records += 1 records += 1
return float(correct_genotype) / records return float(correct_genotype) / records
@ -52,18 +55,18 @@ if __name__ == "__main__":
filename = sys.argv[1] filename = sys.argv[1]
fin = open(filename) fin = open(filename)
locus_gen = chunk_generator(chopped_line_generator(filename), (3,4)) locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)" print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
for locus in locus_gen: for locus in locus_gen:
#print "NEW LOCUS" #print "NEW LOCUS"
covs = dict() covs = dict()
coverage_chunk_gen = chunk_generator(locus, (1,3,4)) coverage_chunk_gen = chunk_generator(locus, (0,4,5))
for cov_chunk in coverage_chunk_gen: for cov_chunk in coverage_chunk_gen:
#print "NEW COVERAGE" #print "NEW COVERAGE"
#print "\n".join(cov_chunk) #print "\n".join(cov_chunk)
fields = cov_chunk[0].split() fields = cov_chunk[0].split()
coverage = fields[1] coverage = fields[1]
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0]))) print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
#covs[coverage] = cov_chunk #covs[coverage] = cov_chunk