2009-07-03 16:07:02 +08:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
2009-07-09 06:04:26 +08:00
|
|
|
import sys, itertools, FlatFileTable
|
2009-07-03 16:07:02 +08:00
|
|
|
|
|
|
|
|
def subset_list_by_indices(indices, list):
    """Return the elements of `list` found at the given positions.

    Input:
    indices: iterable of indices to look up, in the desired output order
             (repeats are allowed)
    list: indexable container to read from; the name shadows the builtin
          but is kept so that keyword callers keep working
    Output:
    a new list holding list[index] for every index, in the order of indices

    Any exception raised by list[index] (e.g. IndexError for an
    out-of-range position) propagates to the caller.
    """
    return [list[index] for index in indices]
|
|
|
|
|
|
2009-07-09 06:04:26 +08:00
|
|
|
def chunk_generator(record_gen, key_fields):
    """Group consecutive records that share the same key into chunks.

    Input:
    record_gen: iterable of records; each record must support
                record[field] for every field in key_fields
    key_fields: fields of each record used to determine chunk membership
    Output:
    yields one list per run of consecutive records whose key_fields
    values are all equal; nothing is yielded when record_gen is empty
    """
    def record_key(record):
        return [record[field] for field in key_fields]

    # groupby only merges *adjacent* items with equal keys, which is the
    # "consecutive records" contract; records with the same key that are
    # separated by a different key end up in separate chunks.
    for _, group in itertools.groupby(record_gen, record_key):
        # Materialize before advancing: the group iterator is invalidated
        # as soon as groupby moves on to the next key.
        yield list(group)
|
|
|
|
|
|
|
|
|
|
def chunk_stats(chunk, lod_threshold=5):
    """Summarize genotype-call accuracy over a chunk of records.

    Input:
    chunk: iterable of records, each providing the keys "BtnbLod"
           (convertible with float()), "HapmapChipGenotype" and
           "BestGenotype"
    lod_threshold: minimum absolute "BtnbLod" value for a record to count
                   as a confident call (default 5)
    Output:
    a 2-tuple of floats:
      - fraction of confident calls whose "BestGenotype" equals
        "HapmapChipGenotype"
      - fraction of all records that are confident calls
    Both fractions are 0.0 when their denominator would be zero.
    """
    records = 0
    conf_calls = 0
    correct_genotype = 0
    for record in chunk:
        records += 1
        if abs(float(record["BtnbLod"])) >= lod_threshold:
            conf_calls += 1
            # Correctness is only scored among confident calls, matching
            # the conf_calls denominator used below.
            if record["HapmapChipGenotype"] == record["BestGenotype"]:
                correct_genotype += 1

    # max(..., 1) avoids a ZeroDivisionError for empty chunks or chunks
    # without any confident call.
    return (float(correct_genotype) / max(conf_calls, 1),
            float(conf_calls) / max(records, 1))
|
2009-07-03 16:07:02 +08:00
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the file named on the command line and print
    # one tab-separated line of chunk_stats() results per run of records
    # sharing the same (DownsampledCoverage, Sequence, Position).
    if len(sys.argv) < 2:
        sys.exit("Usage: CoverageEval.py geli_file")
    filename = sys.argv[1]

    # Outer grouping: consecutive records with the same Sequence/Position.
    locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))

    # print(...) with a single parenthesized argument behaves the same as a
    # Python 2 print statement and as the Python 3 print function.
    # NOTE(review): this header names five columns, but each data row below
    # has six (the two chunk_stats fractions come first) -- confirm whether
    # downstream consumers rely on the current header before changing it.
    print("Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)")
    for locus in locus_gen:
        # Inner grouping: within one locus, split by DownsampledCoverage.
        coverage_chunk_gen = chunk_generator(locus, ("DownsampledCoverage", "Sequence", "Position"))
        for cov_chunk in coverage_chunk_gen:
            if not cov_chunk:
                # Nothing to report for an empty chunk; also avoids an
                # IndexError on cov_chunk[0] below.
                continue
            # All records in the chunk share the printed key fields, so the
            # first one is used as the representative.
            record = cov_chunk[0]
            print("\t".join(map(str,("%.4f\t%.4f"%chunk_stats(cov_chunk), record["DownsampledCoverage"], record["Sequence"]+":"+record["Position"],record["ReferenceBase"],record["HapmapChipGenotype"]))))
|
2009-07-03 16:07:02 +08:00
|
|
|
|
|
|
|
|
|