Added unit tests for the ListUtils function that randomly samples lists with replacement, updated AlleleFrequencyEstimate to provide a callType of HomRef, HetSNP, or HomSNP, updated indices in CoverageEval.py, and made many changes to CoverageEvalWalker, the biggest being that it directly calls SingleSampleGenotyper instead of implementing some parts of SSG itself.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1189 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4ba2194b5e
commit
d3daecfc4d
|
|
@ -1,8 +1,6 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.playground.utils.IndelLikelihood;
|
||||
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||
|
|
@ -31,10 +29,19 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
|
|||
@Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false;
|
||||
|
||||
@Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE;
|
||||
@Argument(fullName="min_coverage", shortName="mincov", doc="Mininum coverage to downsample to", required=false) public int min_coverage=1;
|
||||
@Argument(fullName="max_coverage", shortName="maxcov", doc="Maximum coverage to downsample to", required=false) public int max_coverage=20;
|
||||
@Argument(fullName="downsampling_repeats", shortName="repeat", doc="Number of times to repeat downsampling at each coverage level", required=false) public int downsampling_repeats=20;
|
||||
|
||||
public PrintStream variantsOut;
|
||||
|
||||
SingleSampleGenotyper SSG;
|
||||
|
||||
public void initialize() {
|
||||
SSG = new SingleSampleGenotyper();
|
||||
SSG.VARIANTS_FILE = VARIANTS_FILE;
|
||||
SSG.initialize();
|
||||
|
||||
try {
|
||||
variantsOut = new PrintStream(VARIANTS_FILE);
|
||||
} catch (FileNotFoundException e) {
|
||||
|
|
@ -43,7 +50,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
|
|||
}
|
||||
|
||||
String header = GELI_OUTPUT_FORMAT ? AlleleFrequencyEstimate.geliHeaderString() : AlleleFrequencyEstimate.asTabularStringHeader();
|
||||
variantsOut.println("#DownsampledCoverage\tAvailableCoveragt \t"+header);
|
||||
variantsOut.println("DownsampledCoverage\tAvailableCoverage\tHapmapChipGenotype\tGenotypeCallType\t"+header.substring(1));
|
||||
}
|
||||
|
||||
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
|
|
@ -51,69 +58,45 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
|
|||
}
|
||||
|
||||
public List<String> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
|
||||
rodGFF hapmap_chip = (rodGFF)tracker.lookup("hapmap-chip", null);
|
||||
String hc_genotype;
|
||||
|
||||
if (hapmap_chip != null) {
|
||||
hc_genotype = hapmap_chip.getFeature();
|
||||
}else{
|
||||
hc_genotype = new String(new char[] {ref, ref});
|
||||
}
|
||||
|
||||
//if (tracker.hasROD("hapmap-chip")) {
|
||||
ArrayList<String> Gs = new ArrayList<String>();
|
||||
ArrayList<String> GenotypeCalls = new ArrayList<String>();
|
||||
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
String bases = pileup.getBases();
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
|
||||
// Iterate over coverage levels
|
||||
int coverage_available = reads.size();
|
||||
int coverage_levels[] = {4, 10, 20, Integer.MAX_VALUE};
|
||||
int downsampling_repeats = 10; // number of times to random re-sample each coverage_level
|
||||
for (int coverage : coverage_levels) {
|
||||
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
|
||||
for (int r=0; r<downsampling_repeats; r++) {
|
||||
List<Integer> subset_indices = ListUtils.randomSubsetIndices(coverage, coverage_available);
|
||||
List<SAMRecord> sub_reads = ListUtils.subsetListByIndices(subset_indices, reads);
|
||||
List<Integer> sub_offsets = ListUtils.subsetListByIndices(subset_indices, offsets);
|
||||
|
||||
// Call genotypes on subset of reads and offsets
|
||||
GenotypeLikelihoods G = callGenotype(tracker, ref, pileup, sub_reads, sub_offsets);
|
||||
String geliString = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, "sample").asGeliString();
|
||||
|
||||
Gs.add(hc_genotype+"\t"+coverage+"\t"+coverage_available+"\t"+geliString);
|
||||
int coverage_available = reads.size();
|
||||
List<Integer> coverage_levels = new ArrayList<Integer>();// = {4, 7, 10, 20, Integer.MAX_VALUE};
|
||||
for (int coverage = min_coverage; coverage <= max_coverage; coverage++) {
|
||||
coverage_levels.add(coverage);
|
||||
}
|
||||
coverage_levels.add(coverage_available); // Run on all available reads
|
||||
|
||||
// Iterate over coverage levels
|
||||
for (int coverage : coverage_levels) {
|
||||
coverage = Math.min(coverage_available, coverage); // don't exceed max available coverage
|
||||
for (int r=0; r<downsampling_repeats; r++) {
|
||||
List<Integer> subset_indices = ListUtils.sampleIndicesWithReplacement(coverage, coverage_available);
|
||||
List<SAMRecord> sub_reads = ListUtils.sliceListByIndices(subset_indices, reads);
|
||||
List<Integer> sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets);
|
||||
|
||||
LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
|
||||
AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
|
||||
|
||||
if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
|
||||
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
|
||||
}
|
||||
}
|
||||
}
|
||||
return GenotypeCalls;
|
||||
}else{
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
|
||||
return Gs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls the underlying, single locus genotype of the sample
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param ref the reference base
|
||||
* @param pileup the pileup object for the given locus
|
||||
* @param reads the reads that overlap this locus
|
||||
* @param offsets the offsets per read that identify the base at this locus
|
||||
* @return the likelihoods per genotype
|
||||
*/
|
||||
private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List<SAMRecord> reads, List<Integer> offsets) {
|
||||
GenotypeLikelihoods G;
|
||||
|
||||
G = new GenotypeLikelihoods();
|
||||
|
||||
for ( int i = 0; i < reads.size(); i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
|
||||
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
|
||||
}
|
||||
|
||||
G.ApplyPrior(ref, 'N', -1);
|
||||
|
||||
return G;
|
||||
}
|
||||
|
||||
public String reduceInit() {
|
||||
|
|
@ -121,8 +104,6 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
|
|||
}
|
||||
|
||||
public String reduce(List<String> alleleFreqLines, String sum) {
|
||||
//GenomeLoc a = GenomeLocParser.parseGenomeLoc("chr1:42971309");
|
||||
//if ((alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD)) { // || (alleleFreq.location == a) ) {
|
||||
for (String line : alleleFreqLines) {
|
||||
variantsOut.println(line);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -248,4 +248,14 @@ public class AlleleFrequencyEstimate {
|
|||
{
|
||||
return this.posteriors[(int)this.qstar * this.N];
|
||||
}
|
||||
|
||||
public String callType() {
|
||||
// Returns a string indicating whether the call is homozygous reference, heterozygous SNP, or homozygous SNP
|
||||
|
||||
String[] callTypeString = {"HomozygousSNP", "HeterozygousSNP", "HomozygousReference"};
|
||||
String genotype = genotype();
|
||||
int ref_matches = (genotype.charAt(0) == ref ? 1 : 0) + (genotype.charAt(1) == ref ? 1 : 0);
|
||||
return callTypeString[ref_matches];
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,13 +16,8 @@ public class ListUtils {
|
|||
|
||||
static Random rand = new Random(12321); //System.currentTimeMillis());
|
||||
|
||||
static public ArrayList<Integer> randomSubsetIndices(int n, int k) {
|
||||
static public ArrayList<Integer> sampleIndicesWithReplacement(int n, int k) {
|
||||
// Returns n random indices drawn with replacement from the range 1..k
|
||||
|
||||
/*ArrayList<Integer> balls = new ArrayList<Integer>();
|
||||
for (int i=0; i<k; i++) {
|
||||
balls.add(i);
|
||||
} */
|
||||
|
||||
ArrayList<Integer> chosen_balls = new ArrayList <Integer>();
|
||||
for (int i=0; i<n; i++) {
|
||||
|
|
@ -34,8 +29,9 @@ public class ListUtils {
|
|||
return chosen_balls;
|
||||
}
|
||||
|
||||
static public <T> ArrayList<T> subsetListByIndices(List<Integer> indices, List<T> list) {
|
||||
// Given a list of indices into a list, return those elements of the list list
|
||||
static public <T> ArrayList<T> sliceListByIndices(List<Integer> indices, List<T> list) {
|
||||
// Given a list of indices into a list, return those elements of the list with the possibility
|
||||
// of drawing list elements multiple times
|
||||
|
||||
ArrayList<T> subset = new ArrayList<T>();
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import sys
|
|||
|
||||
def chopped_line_generator(filename):
    """Yield each data line of ``filename`` with trailing whitespace removed.

    The first line of the file is assumed to be a column header and is
    skipped.

    Args:
        filename: path of the text file to read.

    Yields:
        str: each subsequent line, right-stripped of trailing whitespace
        (including the newline).
    """
    # Fix: the original opened the file without ever closing it (handle leak).
    # A context manager closes it deterministically, even if the consumer
    # abandons the generator early (close propagates via GeneratorExit).
    with open(filename) as fin:
        fin.readline()  # pull off header
        for line in fin:
            yield line.rstrip()
|
||||
|
|
@ -24,16 +25,18 @@ Output:
|
|||
|
||||
locus_chunk = []
|
||||
last_key = ""
|
||||
first_line = True
|
||||
for line in line_gen:
|
||||
fields = line.split()
|
||||
key = subset_list_by_indices(key_fields, fields)
|
||||
if key == last_key:
|
||||
if key == last_key or first_line:
|
||||
locus_chunk.append(line)
|
||||
first_line = False
|
||||
else:
|
||||
last_key =key
|
||||
if locus_chunk != []:
|
||||
yield locus_chunk
|
||||
locus_chunk = []
|
||||
locus_chunk = [line]
|
||||
last_key = key
|
||||
yield locus_chunk
|
||||
|
||||
def chunk_stats(chunk):
|
||||
|
|
@ -41,7 +44,7 @@ def chunk_stats(chunk):
|
|||
correct_genotype = 0
|
||||
for record in chunk:
|
||||
fields = record.split()
|
||||
if fields[0] == fields[8]:
|
||||
if fields[2] == fields[9]:
|
||||
correct_genotype += 1
|
||||
records += 1
|
||||
return float(correct_genotype) / records
|
||||
|
|
@ -52,18 +55,18 @@ if __name__ == "__main__":
|
|||
filename = sys.argv[1]
|
||||
|
||||
fin = open(filename)
|
||||
locus_gen = chunk_generator(chopped_line_generator(filename), (3,4))
|
||||
locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
|
||||
print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
|
||||
for locus in locus_gen:
|
||||
#print "NEW LOCUS"
|
||||
covs = dict()
|
||||
coverage_chunk_gen = chunk_generator(locus, (1,3,4))
|
||||
coverage_chunk_gen = chunk_generator(locus, (0,4,5))
|
||||
for cov_chunk in coverage_chunk_gen:
|
||||
#print "NEW COVERAGE"
|
||||
#print "\n".join(cov_chunk)
|
||||
fields = cov_chunk[0].split()
|
||||
coverage = fields[1]
|
||||
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[3]+":"+fields[4],fields[5],fields[0])))
|
||||
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
|
||||
|
||||
#covs[coverage] = cov_chunk
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue