diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java index 99d1a1a37..1599c815b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.playground.gatk.walkers; import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.playground.utils.IndelLikelihood; -import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods; import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.rodGFF; @@ -31,10 +29,19 @@ public class CoverageEvalWalker extends LocusWalker, String> { @Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false; @Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE; + @Argument(fullName="min_coverage", shortName="mincov", doc="Minimum coverage to downsample to", required=false) public int min_coverage=1; + @Argument(fullName="max_coverage", shortName="maxcov", doc="Maximum coverage to downsample to", required=false) public int max_coverage=20; + @Argument(fullName="downsampling_repeats", shortName="repeat", doc="Number of times to repeat downsampling at each coverage level", required=false) public int downsampling_repeats=20; public PrintStream variantsOut; + SingleSampleGenotyper SSG; + public void initialize() { + SSG = new SingleSampleGenotyper(); + SSG.VARIANTS_FILE = VARIANTS_FILE; + SSG.initialize(); + try { variantsOut = new PrintStream(VARIANTS_FILE); } catch (FileNotFoundException e) { @@ -43,7 +50,7 @@ 
public class CoverageEvalWalker extends LocusWalker, String> { } String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader(); - variantsOut.println("#DownsampledCoverage\tAvailableCoveragt \t"+header); + variantsOut.println("DownsampledCoverage\tAvailableCoverage\tHapmapChipGenotype\tGenotypeCallType\t"+header.substring(1)); } public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { @@ -51,69 +58,45 @@ public class CoverageEvalWalker extends LocusWalker, String> { } public List map(RefMetaDataTracker tracker, char ref, LocusContext context) { + rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null); String hc_genotype; + if (hapmap_chip != null) { hc_genotype = hapmap_chip.getFeature(); - }else{ - hc_genotype = new String(new char[] {ref, ref}); - } - //if (tracker.hasROD("hapmap-chip")) { - ArrayList Gs = new ArrayList(); + ArrayList GenotypeCalls = new ArrayList(); - ReadBackedPileup pileup = new ReadBackedPileup(ref, context); - String bases = pileup.getBases(); - List reads = context.getReads(); - List offsets = context.getOffsets(); + List reads = context.getReads(); + List offsets = context.getOffsets(); - // Iterate over coverage levels - int coverage_available = reads.size(); - int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE}; - int downsampling_repeats = 10; // number of times to random re-sample each coverage_level - for (int coverage : coverage_levels) { - coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage - for (int r=0; r subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available); - List sub_reads = ListUtils.subsetListByIndices(subset_indices, reads); - List sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets); - - // Call genotypes on subset of reads and offsets - GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets); - String geliString = 
G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString(); - - Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString); + int coverage_available = reads.size(); + List coverage_levels = new ArrayList();// = {4, 7, 10, 20, Integer.MAX_VALUE}; + for (int coverage = min_coverage; coverage <= max_coverage; coverage++) { + coverage_levels.add(coverage); } + coverage_levels.add(coverage_available); // Run on all available reads + + // Iterate over coverage levels + for (int coverage : coverage_levels) { + coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage + for (int r=0; r subset_indices = ListUtils.sampleIndicesWithReplacement(coverage, coverage_available); + List sub_reads = ListUtils.sliceListByIndices(subset_indices, reads); + List sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets); + + LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets); + AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext); + + if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= -LOD_THRESHOLD)) { + GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString()); + } + } + } + return GenotypeCalls; + }else{ + return new ArrayList(); } - - return Gs; - } - - /** - * Calls the underlying, single locus genotype of the sample - * - * @param tracker the meta data tracker - * @param ref the reference base - * @param pileup the pileup object for the given locus - * @param reads the reads that overlap this locus - * @param offsets the offsets per read that identify the base at this locus - * @return the likelihoods per genotype - */ - private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List reads, List offsets) { - GenotypeLikelihoods G; - - G = new 
GenotypeLikelihoods(); - - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); - } - - G.ApplyPrior(ref, 'N', -1); - - return G; } public String reduceInit() { @@ -121,8 +104,6 @@ public class CoverageEvalWalker extends LocusWalker, String> { } public String reduce(List alleleFreqLines, String sum) { - //GenomeLoc a = GenomeLocParser.parseGenomeLoc("chr1:42971309"); - //if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) ) { for (String line : alleleFreqLines) { variantsOut.println(line); } diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java index 05afdf8ce..24467274f 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java @@ -248,4 +248,14 @@ public class AlleleFrequencyEstimate { { return this.posteriors[(int)this.qstar * this.N]; } + + public String callType() { + // Returns a string indicating whether the call is homozygous reference, heterozygous SNP, or homozygous SNP + + String[] callTypeString = {"HomozygousSNP", "HeterozygousSNP", "HomozygousReference"}; + String genotype = genotype(); + int ref_matches = (genotype.charAt(0) == ref ? 1 : 0) + (genotype.charAt(1) == ref ? 
1 : 0); + return callTypeString[ref_matches]; + } + } diff --git a/java/src/org/broadinstitute/sting/utils/ListUtils.java b/java/src/org/broadinstitute/sting/utils/ListUtils.java index 5074d4eb1..d8581c06d 100644 --- a/java/src/org/broadinstitute/sting/utils/ListUtils.java +++ b/java/src/org/broadinstitute/sting/utils/ListUtils.java @@ -16,13 +16,8 @@ public class ListUtils { static Random rand = new Random(12321); //System.currentTimeMillis()); - static public ArrayList randomSubsetIndices(int n, int k) { + static public ArrayList sampleIndicesWithReplacement(int n, int k) { // Returns n random indices drawn with replacement from the range 1..k - - /*ArrayList balls = new ArrayList(); - for (int i=0; i chosen_balls = new ArrayList (); for (int i=0; i ArrayList subsetListByIndices(List indices, List list) { - // Given a list of indices into a list, return those elements of the list list + static public ArrayList sliceListByIndices(List indices, List list) { + // Given a list of indices into a list, return those elements of the list with the possibility + // of drawing list elements multiple times ArrayList subset = new ArrayList(); diff --git a/python/CoverageEval.py b/python/CoverageEval.py index 82cc9aa8f..34ba63c22 100755 --- a/python/CoverageEval.py +++ b/python/CoverageEval.py @@ -4,6 +4,7 @@ import sys def chopped_line_generator(filename): fin = open(filename) + fin.readline() # pull off header for line in fin: line = line.rstrip() yield line @@ -24,16 +25,18 @@ Output: locus_chunk = [] last_key = "" + first_line = True for line in line_gen: fields = line.split() key = subset_list_by_indices(key_fields, fields) - if key == last_key: + if key == last_key or first_line: locus_chunk.append(line) + first_line = False else: - last_key =key if locus_chunk != []: yield locus_chunk - locus_chunk = [] + locus_chunk = [line] + last_key = key yield locus_chunk def chunk_stats(chunk): @@ -41,7 +44,7 @@ def chunk_stats(chunk): correct_genotype = 0 for record in chunk: 
fields = record.split() - if fields[0] == fields[8]: + if fields[2] == fields[9]: correct_genotype += 1 records += 1 return float(correct_genotype) / records @@ -52,18 +55,18 @@ if __name__ == "__main__": filename = sys.argv[1] fin = open(filename) - locus_gen = chunk_generator(chopped_line_generator(filename), (3,4)) + locus_gen = chunk_generator(chopped_line_generator(filename), (4,5)) print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)" for locus in locus_gen: #print "NEW LOCUS" covs = dict() - coverage_chunk_gen = chunk_generator(locus, (1,3,4)) + coverage_chunk_gen = chunk_generator(locus, (0,4,5)) for cov_chunk in coverage_chunk_gen: #print "NEW COVERAGE" #print "\n".join(cov_chunk) fields = cov_chunk[0].split() coverage = fields[1] - print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0]))) + print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), fields[0], fields[4]+":"+fields[5],fields[6],fields[2]))) #covs[coverage] = cov_chunk