Added unit tests for the ListUtils function that randomly samples lists with replacement, updated AlleleFrequencyEstimate to provide a callType of HomRef, HetSNP, or HomSNP, updated indices in CoverageEval.py, and made many changes to CoverageEvalWalker — the biggest being that it now calls SingleSampleGenotyper directly instead of re-implementing parts of SSG itself.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1189 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-07-08 02:05:40 +00:00
parent 4ba2194b5e
commit d3daecfc4d
4 changed files with 64 additions and 74 deletions

View File

@ -1,8 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers; package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.utils.IndelLikelihood;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodGFF; import org.broadinstitute.sting.gatk.refdata.rodGFF;
@ -31,10 +29,19 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
@Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false; @Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false;
@Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE; @Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE;
@Argument(fullName="min_coverage", shortName="mincov", doc="Mininum coverage to downsample to", required=false) public int min_coverage=1;
@Argument(fullName="max_coverage", shortName="maxcov", doc="Maximum coverage to downsample to", required=false) public int max_coverage=20;
@Argument(fullName="downsampling_repeats", shortName="repeat", doc="Number of times to repeat downsampling at each coverage level", required=false) public int downsampling_repeats=20;
public PrintStream variantsOut; public PrintStream variantsOut;
SingleSampleGenotyper SSG;
public void initialize() { public void initialize() {
SSG = new SingleSampleGenotyper();
SSG.VARIANTS_FILE = VARIANTS_FILE;
SSG.initialize();
try { try {
variantsOut = new PrintStream(VARIANTS_FILE); variantsOut = new PrintStream(VARIANTS_FILE);
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
@ -43,7 +50,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
} }
String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader(); String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader();
variantsOut.println("#DownsampledCoverage\tAvailableCoveragt \t"+header); variantsOut.println("DownsampledCoverage\tAvailableCoverage\tHapmapChipGenotype\tGenotypeCallType\t"+header.substring(1));
} }
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
@ -51,69 +58,45 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
} }
public List<String> map(RefMetaDataTracker tracker, char ref, LocusContext context) { public List<String> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null); rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null);
String hc_genotype; String hc_genotype;
if (hapmap_chip != null) { if (hapmap_chip != null) {
hc_genotype = hapmap_chip.getFeature(); hc_genotype = hapmap_chip.getFeature();
}else{
hc_genotype = new String(new char[] {ref, ref});
}
//if (tracker.hasROD("hapmap-chip")) { ArrayList<String> GenotypeCalls = new ArrayList<String>();
ArrayList<String> Gs = new ArrayList<String>();
ReadBackedPileup pileup = new ReadBackedPileup(ref, context); List<SAMRecord> reads = context.getReads();
String bases = pileup.getBases(); List<Integer> offsets = context.getOffsets();
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
// Iterate over coverage levels int coverage_available = reads.size();
int coverage_available = reads.size(); List<Integer> coverage_levels = new ArrayList<Integer>();// = {4, 7, 10, 20, Integer.MAX_VALUE};
int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE}; for (int coverage = min_coverage; coverage <= max_coverage; coverage++) {
int downsampling_repeats = 10; // number of times to random re-sample each coverage_level coverage_levels.add(coverage);
for (int coverage : coverage_levels) {
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
for (int r=0; r<downsampling_repeats; r++) {
List<Integer> subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available);
List<SAMRecord> sub_reads = ListUtils.subsetListByIndices(subset_indices, reads);
List<Integer> sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets);
// Call genotypes on subset of reads and offsets
GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets);
String geliString = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString();
Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString);
} }
coverage_levels.add(coverage_available); // Run on all available reads
// Iterate over coverage levels
for (int coverage : coverage_levels) {
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
for (int r=0; r<downsampling_repeats; r++) {
List<Integer> subset_indices = ListUtils.sampleIndicesWithReplacement(coverage, coverage_available);
List<SAMRecord> sub_reads = ListUtils.sliceListByIndices(subset_indices, reads);
List<Integer> sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets);
LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
}
}
}
return GenotypeCalls;
}else{
return new ArrayList<String>();
} }
return Gs;
}
/**
* Calls the underlying, single locus genotype of the sample
*
* @param tracker the meta data tracker
* @param ref the reference base
* @param pileup the pileup object for the given locus
* @param reads the reads that overlap this locus
* @param offsets the offsets per read that identify the base at this locus
* @return the likelihoods per genotype
*/
private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List<SAMRecord> reads, List<Integer> offsets) {
GenotypeLikelihoods G;
G = new GenotypeLikelihoods();
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
}
G.ApplyPrior(ref, 'N', -1);
return G;
} }
public String reduceInit() { public String reduceInit() {
@ -121,8 +104,6 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
} }
public String reduce(List<String> alleleFreqLines, String sum) { public String reduce(List<String> alleleFreqLines, String sum) {
//GenomeLoc a = GenomeLocParser.parseGenomeLoc("chr1:42971309");
//if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) ) {
for (String line : alleleFreqLines) { for (String line : alleleFreqLines) {
variantsOut.println(line); variantsOut.println(line);
} }

View File

@ -248,4 +248,14 @@ public class AlleleFrequencyEstimate {
{ {
return this.posteriors[(int)this.qstar * this.N]; return this.posteriors[(int)this.qstar * this.N];
} }
public String callType() {
// Returns a string indicating whether the call is homozygous reference, heterozygous SNP, or homozygous SNP
String[] callTypeString = {"HomozygousSNP", "HeterozygousSNP", "HomozygousReference"};
String genotype = genotype();
int ref_matches = (genotype.charAt(0) == ref ? 1 : 0) + (genotype.charAt(1) == ref ? 1 : 0);
return callTypeString[ref_matches];
}
} }

View File

@ -16,14 +16,9 @@ public class ListUtils {
static Random rand = new Random(12321); //System.currentTimeMillis()); static Random rand = new Random(12321); //System.currentTimeMillis());
static public ArrayList<Integer> randomSubsetIndices(int n, int k) { static public ArrayList<Integer> sampleIndicesWithReplacement(int n, int k) {
// Returns n random indices drawn with replacement from the range 1..k // Returns n random indices drawn with replacement from the range 1..k
/*ArrayList<Integer> balls = new ArrayList<Integer>();
for (int i=0; i<k; i++) {
balls.add(i);
} */
ArrayList<Integer> chosen_balls = new ArrayList <Integer>(); ArrayList<Integer> chosen_balls = new ArrayList <Integer>();
for (int i=0; i<n; i++) { for (int i=0; i<n; i++) {
//Integer chosen_ball = balls[rand.nextInt(k)]; //Integer chosen_ball = balls[rand.nextInt(k)];
@ -34,8 +29,9 @@ public class ListUtils {
return chosen_balls; return chosen_balls;
} }
static public <T> ArrayList<T> subsetListByIndices(List<Integer> indices, List<T> list) { static public <T> ArrayList<T> sliceListByIndices(List<Integer> indices, List<T> list) {
// Given a list of indices into a list, return those elements of the list list // Given a list of indices into a list, return those elements of the list with the possibility
// of drawing list elements multiple times
ArrayList<T> subset = new ArrayList<T>(); ArrayList<T> subset = new ArrayList<T>();

View File

@ -4,6 +4,7 @@ import sys
def chopped_line_generator(filename): def chopped_line_generator(filename):
fin = open(filename) fin = open(filename)
fin.readline() # pull off header
for line in fin: for line in fin:
line = line.rstrip() line = line.rstrip()
yield line yield line
@ -24,16 +25,18 @@ Output:
locus_chunk = [] locus_chunk = []
last_key = "" last_key = ""
first_line = True
for line in line_gen: for line in line_gen:
fields = line.split() fields = line.split()
key = subset_list_by_indices(key_fields, fields) key = subset_list_by_indices(key_fields, fields)
if key == last_key: if key == last_key or first_line:
locus_chunk.append(line) locus_chunk.append(line)
first_line = False
else: else:
last_key =key
if locus_chunk != []: if locus_chunk != []:
yield locus_chunk yield locus_chunk
locus_chunk = [] locus_chunk = [line]
last_key = key
yield locus_chunk yield locus_chunk
def chunk_stats(chunk): def chunk_stats(chunk):
@ -41,7 +44,7 @@ def chunk_stats(chunk):
correct_genotype = 0 correct_genotype = 0
for record in chunk: for record in chunk:
fields = record.split() fields = record.split()
if fields[0] == fields[8]: if fields[2] == fields[9]:
correct_genotype += 1 correct_genotype += 1
records += 1 records += 1
return float(correct_genotype) / records return float(correct_genotype) / records
@ -52,18 +55,18 @@ if __name__ == "__main__":
filename = sys.argv[1] filename = sys.argv[1]
fin = open(filename) fin = open(filename)
locus_gen = chunk_generator(chopped_line_generator(filename), (3,4)) locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)" print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
for locus in locus_gen: for locus in locus_gen:
#print "NEW LOCUS" #print "NEW LOCUS"
covs = dict() covs = dict()
coverage_chunk_gen = chunk_generator(locus, (1,3,4)) coverage_chunk_gen = chunk_generator(locus, (0,4,5))
for cov_chunk in coverage_chunk_gen: for cov_chunk in coverage_chunk_gen:
#print "NEW COVERAGE" #print "NEW COVERAGE"
#print "\n".join(cov_chunk) #print "\n".join(cov_chunk)
fields = cov_chunk[0].split() fields = cov_chunk[0].split()
coverage = fields[1] coverage = fields[1]
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0]))) print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
#covs[coverage] = cov_chunk #covs[coverage] = cov_chunk