Added ParseDCCSequenceData.py, which enables analysis of the quantity of sequence data by platform and project; moved the table/record system into a new module, FlatFileTable.py, and adopted it in ParseDCCSequenceData.py and CoverageEval.py; changed the LOD threshold in CoverageEvalWalker.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1201 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-07-08 22:04:26 +00:00
parent 3f0304de5a
commit c8fcecbc6f
4 changed files with 116 additions and 32 deletions

View File

@ -88,7 +88,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
if (alleleFreq != null) {
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
}
}

View File

@ -1,13 +1,6 @@
#!/usr/bin/env python
import sys
def chopped_line_generator(filename):
    """Yield each data line of *filename* with trailing whitespace removed.

    The first line of the file is treated as a header and skipped."""
    fin = open(filename)
    fin.readline()  # discard the header line
    for raw in fin:
        yield raw.rstrip()
import sys, itertools, FlatFileTable
def subset_list_by_indices(indices, list):
    """Return the elements of *list* at the given *indices*, in index order.

    NOTE(review): the parameter name shadows the builtin ``list``; kept
    for backward compatibility with existing callers."""
    subset = []
    for index in indices:
        subset.append(list[index])
    return subset
def chunk_generator(record_gen, key_fields):
    """Group consecutive records that share the same key.

    Input:
    record_gen: iterable of records (dicts keyed by field name)
    key_fields: field names whose values determine chunk membership
    Output:
    yields lists of consecutive records whose key_fields values are equal."""
    locus_chunk = []
    last_key = ""
    first_record = True
    for record in record_gen:
        key = [record[f] for f in key_fields]
        if key == last_key or first_record:
            locus_chunk.append(record)
            first_record = False
        else:
            if locus_chunk != []:
                yield locus_chunk
            locus_chunk = [record]
        # update on every iteration, so the chunk that starts the stream is
        # compared against its own key rather than the initial "" sentinel
        last_key = key
    # flush the final chunk; skip it when the input was empty
    if locus_chunk != []:
        yield locus_chunk
def chunk_stats(chunk):
    """Compute call statistics for one chunk of genotype records.

    Returns a pair:
      (fraction of confident calls whose BestGenotype matches the
       HapmapChipGenotype, fraction of records that were confident calls).
    A call is counted as confident when |BtnbLod| >= 5.  The max(..., 1)
    guards keep the divisions defined when a chunk has no confident calls
    or no records at all.
    NOTE(review): the genotype-match test is applied only to confident
    calls, consistent with dividing correct_genotype by conf_calls."""
    records = 0
    conf_calls = 0
    correct_genotype = 0
    for record in chunk:
        if abs(float(record["BtnbLod"])) >= 5:
            conf_calls += 1
            if record["HapmapChipGenotype"] == record["BestGenotype"]:
                correct_genotype += 1
        records += 1
    return float(correct_genotype) / max(conf_calls,1), float(conf_calls) / max(records,1)
if __name__ == "__main__":
if len(sys.argv) < 2:
sys.exit("Usage: CoverageEval.py geli_file")
filename = sys.argv[1]
fin = open(filename)
locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))
print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
for locus in locus_gen:
#print "NEW LOCUS"
covs = dict()
coverage_chunk_gen = chunk_generator(locus, (0,4,5))
coverage_chunk_gen = chunk_generator(locus, ("DownsampledCoverage", "Sequence", "Position"))
for cov_chunk in coverage_chunk_gen:
#print "NEW COVERAGE"
#print "\n".join(cov_chunk)
fields = cov_chunk[0].split()
coverage = fields[1]
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
#covs[coverage] = cov_chunk
record = cov_chunk[0]
print "\t".join(map(str,("%.4f\t%.4f"%chunk_stats(cov_chunk), record["DownsampledCoverage"], record["Sequence"]+":"+record["Position"],record["ReferenceBase"],record["HapmapChipGenotype"])))

View File

@ -0,0 +1,19 @@
#!/usr/bin/env python
import sys, itertools
def record_generator(filename, sep="\t"):
    """Given a file with field headers on the first line and records on
    subsequent lines, generate a dictionary for each line keyed by the
    header fields.

    sep: field separator passed to str.split(); None means "any run of
    whitespace" (Python's default split behavior)."""
    fin = open(filename)
    try:
        # split the header with the SAME separator as the data rows, so an
        # explicit sep (e.g. "\t") cannot desynchronize keys and values
        header = fin.readline().rstrip().split(sep)
        for line in fin:
            fields = line.rstrip().split(sep)
            # builtin zip: identical result to itertools.izip when consumed
            # by dict(), and portable to Python 3
            yield dict(zip(header, fields))
    finally:
        # close the handle when the generator is exhausted or discarded
        fin.close()
def record_matches_values(record, match_field_values):
    """Return True iff, for every (field, allowed_values) pair in
    match_field_values, record[field] is one of the allowed values."""
    return all(record[field] in values for field, values in match_field_values)

View File

@ -0,0 +1,75 @@
#!/usr/bin/env python
import operator, FlatFileTable
class db_file:
filename = ""
def __init__(self, filenm):
self.filename = filenm
def count_fields(self, fixed_field, fixed_field_values, count_field):
record_gen = FlatFileTable.record_generator(self.filename)
counts = dict()
#fixed_field_num = self.field_names[fixed_field]
print count_field+" for "+fixed_field+" = "+" or ".join(fixed_field_values)
for record in record_gen:
if record[fixed_field] in fixed_field_values:
#fixed_field_value = fields[fixed_field_num]
count_field_num = record[count_field]
if counts.has_key(count_field_num):
counts[count_field_num] += 1
else:
counts[count_field_num] = 0
for k,v in sorted(counts.items(), key=operator.itemgetter(1), cmp=lambda x,y: y-x ):
print str(k)+"\t"+str(v)
def count_bases(self, fixed_field_values): #, fixed_field_values):
record_gen = FlatFileTable.record_generator(self.filename)
base_count = 0
#fixed_field_num = self.field_names[fixed_field]
#print "For "+fixed_field+" = "+" or ".join(fixed_field_values)+":",
print "For "+ " AND ".join( [one_ffv[0]+" = "+" OR ".join(one_ffv[1]) for one_ffv in fixed_field_values] )
for record in record_gen:
#if record[fixed_field] in fixed_field_values:
if FlatFileTable.record_matches_values(record, fixed_field_values):
try:
base_count += int(record["BASE_COUNT"])
except ValueError:
pass
print "%e bases" % base_count
if __name__ == "__main__":
    db = db_file("sequence.index")
    # platform spellings vary across submitting centers; group the synonyms
    platforms = (("ILLUMINA",), ("AB SOLiD","SOLID","ABI_SOLID","AB SOLiD System 2.0"), ("LS454",))
    studies = (("1000Genomes Project Pilot 1",), ("1000Genomes Project Pilot 2",), ("1000Genomes Project Pilot 3",))
    # per-field breakdowns are currently disabled (empty tuple); restore the
    # commented tuple to re-enable them
    for select_field, select_field_values in (): #(("INSTRUMENT_PLATFORM", platforms), ("STUDY_NAME", studies)):
        for count_field in ("CENTER_NAME", "STUDY_NAME", "INSTRUMENT_PLATFORM"):
            for select_field_value in select_field_values:
                db.count_fields(select_field, select_field_value, count_field)
        for select_field_value in select_field_values:
            db.count_bases(((select_field, select_field_value),))
            print
    # base counts for every platform x study combination
    for field1, value1 in zip(["INSTRUMENT_PLATFORM"]*len(platforms), platforms):
        for field2, value2 in zip(["STUDY_NAME"]*len(studies), studies):
            db.count_bases(((field1, value1), (field2, value2)))