Added unit tests for the ListUtils function that randomly samples lists with replacement, updated AlleleFrequencyEstimate to provide a callType of HomRef, HetSNP, or HomSNP, updated indices in CoverageEval.py, and made many changes to CoverageWalker — the biggest being that it now calls SingleSampleGenotyper directly instead of re-implementing parts of SSG itself.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1189 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-07-08 02:05:40 +00:00
parent 4ba2194b5e
commit d3daecfc4d
4 changed files with 64 additions and 74 deletions

View File

@ -1,8 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.utils.IndelLikelihood;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
@ -31,10 +29,19 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
@Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false;
@Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE;
@Argument(fullName="min_coverage", shortName="mincov", doc="Mininum coverage to downsample to", required=false) public int min_coverage=1;
@Argument(fullName="max_coverage", shortName="maxcov", doc="Maximum coverage to downsample to", required=false) public int max_coverage=20;
@Argument(fullName="downsampling_repeats", shortName="repeat", doc="Number of times to repeat downsampling at each coverage level", required=false) public int downsampling_repeats=20;
public PrintStream variantsOut;
SingleSampleGenotyper SSG;
public void initialize() {
SSG = new SingleSampleGenotyper();
SSG.VARIANTS_FILE = VARIANTS_FILE;
SSG.initialize();
try {
variantsOut = new PrintStream(VARIANTS_FILE);
} catch (FileNotFoundException e) {
@ -43,7 +50,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
}
String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader();
variantsOut.println("#DownsampledCoverage\tAvailableCoveragt \t"+header);
variantsOut.println("DownsampledCoverage\tAvailableCoverage\tHapmapChipGenotype\tGenotypeCallType\t"+header.substring(1));
}
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
@ -51,69 +58,45 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
}
public List<String> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null);
String hc_genotype;
if (hapmap_chip != null) {
hc_genotype = hapmap_chip.getFeature();
}else{
hc_genotype = new String(new char[] {ref, ref});
}
//if (tracker.hasROD("hapmap-chip")) {
ArrayList<String> Gs = new ArrayList<String>();
ArrayList<String> GenotypeCalls = new ArrayList<String>();
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
String bases = pileup.getBases();
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
// Iterate over coverage levels
int coverage_available = reads.size();
int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE};
int downsampling_repeats = 10; // number of times to random re-sample each coverage_level
for (int coverage : coverage_levels) {
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
for (int r=0; r<downsampling_repeats; r++) {
List<Integer> subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available);
List<SAMRecord> sub_reads = ListUtils.subsetListByIndices(subset_indices, reads);
List<Integer> sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets);
// Call genotypes on subset of reads and offsets
GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets);
String geliString = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString();
Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString);
int coverage_available = reads.size();
List<Integer> coverage_levels = new ArrayList<Integer>();// = {4, 7, 10, 20, Integer.MAX_VALUE};
for (int coverage = min_coverage; coverage <= max_coverage; coverage++) {
coverage_levels.add(coverage);
}
coverage_levels.add(coverage_available); // Run on all available reads
// Iterate over coverage levels
for (int coverage : coverage_levels) {
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
for (int r=0; r<downsampling_repeats; r++) {
List<Integer> subset_indices = ListUtils.sampleIndicesWithReplacement(coverage, coverage_available);
List<SAMRecord> sub_reads = ListUtils.sliceListByIndices(subset_indices, reads);
List<Integer> sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets);
LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
}
}
}
return GenotypeCalls;
}else{
return new ArrayList<String>();
}
return Gs;
}
/**
* Calls the underlying, single locus genotype of the sample
*
* @param tracker the meta data tracker
* @param ref the reference base
* @param pileup the pileup object for the given locus
* @param reads the reads that overlap this locus
* @param offsets the offsets per read that identify the base at this locus
* @return the likelihoods per genotype
*/
/**
 * Calls the underlying, single locus genotype of the sample.
 *
 * @param tracker the meta data tracker
 * @param ref     the reference base
 * @param pileup  the pileup object for the given locus
 * @param reads   the reads that overlap this locus
 * @param offsets the offsets per read that identify the base at this locus
 * @return the likelihoods per genotype
 */
private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List<SAMRecord> reads, List<Integer> offsets) {
    GenotypeLikelihoods likelihoods = new GenotypeLikelihoods();
    // Accumulate evidence from the base observed at this locus in each read.
    int numReads = reads.size();
    for (int readIndex = 0; readIndex < numReads; readIndex++) {
        SAMRecord read = reads.get(readIndex);
        int offset = offsets.get(readIndex);
        char observedBase = read.getReadString().charAt(offset);
        byte baseQuality = read.getBaseQualities()[offset];
        likelihoods.add(ref, observedBase, baseQuality);
    }
    // Apply a flat prior ('N' alt, -1 weight) before returning, as the original did.
    likelihoods.ApplyPrior(ref, 'N', -1);
    return likelihoods;
}
public String reduceInit() {
@ -121,8 +104,6 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
}
public String reduce(List<String> alleleFreqLines, String sum) {
//GenomeLoc a = GenomeLocParser.parseGenomeLoc("chr1:42971309");
//if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) ) {
for (String line : alleleFreqLines) {
variantsOut.println(line);
}

View File

@ -248,4 +248,14 @@ public class AlleleFrequencyEstimate {
{
return this.posteriors[(int)this.qstar * this.N];
}
/**
 * Classifies this call by counting how many alleles of the called genotype
 * match the reference base: two matches is a homozygous-reference call, one
 * is a heterozygous SNP, and zero is a homozygous SNP.
 *
 * @return "HomozygousReference", "HeterozygousSNP", or "HomozygousSNP"
 */
public String callType() {
    String genotype = genotype();
    int referenceAlleles = 0;
    if (genotype.charAt(0) == ref) { referenceAlleles++; }
    if (genotype.charAt(1) == ref) { referenceAlleles++; }
    switch (referenceAlleles) {
        case 2:  return "HomozygousReference";
        case 1:  return "HeterozygousSNP";
        default: return "HomozygousSNP";
    }
}
}

View File

@ -16,13 +16,8 @@ public class ListUtils {
static Random rand = new Random(12321); //System.currentTimeMillis());
static public ArrayList<Integer> randomSubsetIndices(int n, int k) {
static public ArrayList<Integer> sampleIndicesWithReplacement(int n, int k) {
// Returns n random indices drawn with replacement from the range 1..k
/*ArrayList<Integer> balls = new ArrayList<Integer>();
for (int i=0; i<k; i++) {
balls.add(i);
} */
ArrayList<Integer> chosen_balls = new ArrayList <Integer>();
for (int i=0; i<n; i++) {
@ -34,8 +29,9 @@ public class ListUtils {
return chosen_balls;
}
static public <T> ArrayList<T> subsetListByIndices(List<Integer> indices, List<T> list) {
// Given a list of indices into a list, return those elements of the list list
static public <T> ArrayList<T> sliceListByIndices(List<Integer> indices, List<T> list) {
// Given a list of indices into a list, return those elements of the list with the possibility
// of drawing list elements multiple times
ArrayList<T> subset = new ArrayList<T>();

View File

@ -4,6 +4,7 @@ import sys
def chopped_line_generator(filename):
    """Yield each data line of `filename` with trailing whitespace stripped.

    The first line of the file is assumed to be a header and is discarded.
    The file handle is closed when the generator is exhausted or garbage
    collected (the original left it open — a resource leak).
    """
    with open(filename) as fin:
        fin.readline()  # pull off and discard the header line
        for line in fin:
            yield line.rstrip()
@ -24,16 +25,18 @@ Output:
locus_chunk = []
last_key = ""
first_line = True
for line in line_gen:
fields = line.split()
key = subset_list_by_indices(key_fields, fields)
if key == last_key:
if key == last_key or first_line:
locus_chunk.append(line)
first_line = False
else:
last_key =key
if locus_chunk != []:
yield locus_chunk
locus_chunk = []
locus_chunk = [line]
last_key = key
yield locus_chunk
def chunk_stats(chunk):
@ -41,7 +44,7 @@ def chunk_stats(chunk):
correct_genotype = 0
for record in chunk:
fields = record.split()
if fields[0] == fields[8]:
if fields[2] == fields[9]:
correct_genotype += 1
records += 1
return float(correct_genotype) / records
@ -52,18 +55,18 @@ if __name__ == "__main__":
filename = sys.argv[1]
fin = open(filename)
locus_gen = chunk_generator(chopped_line_generator(filename), (3,4))
locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
for locus in locus_gen:
#print "NEW LOCUS"
covs = dict()
coverage_chunk_gen = chunk_generator(locus, (1,3,4))
coverage_chunk_gen = chunk_generator(locus, (0,4,5))
for cov_chunk in coverage_chunk_gen:
#print "NEW COVERAGE"
#print "\n".join(cov_chunk)
fields = cov_chunk[0].split()
coverage = fields[1]
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0])))
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
#covs[coverage] = cov_chunk