Added ParseDCCSequenceData.py to the repository to allow analysis of the quantity of sequence data by platform and project; moved the table/record system into a new module, FlatFileTable.py, and built it into ParseDCCSequenceData.py and CoverageEval.py; changed the LOD threshold check in CoverageEvalWalker.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1201 348d0f76-0448-11de-a6fe-93d51630548a
andrewk 2009-07-08 22:04:26 +00:00
parent 3f0304de5a
commit c8fcecbc6f
4 changed files with 116 additions and 32 deletions

CoverageEvalWalker.java

@@ -88,7 +88,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
         LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
         AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
-        if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
+        if (alleleFreq != null) {
             GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
         }
     }

CoverageEval.py

@@ -1,13 +1,6 @@
 #!/usr/bin/env python
-import sys
-
-def chopped_line_generator(filename):
-    fin = open(filename)
-    fin.readline() # pull off header
-    for line in fin:
-        line = line.rstrip()
-        yield line
+import sys, itertools, FlatFileTable
 
 def subset_list_by_indices(indices, list):
     subset = []
@@ -15,59 +8,56 @@ def subset_list_by_indices(indices, list):
         subset.append(list[index])
     return subset
 
-def chunk_generator(line_gen, key_fields):
+def chunk_generator(record_gen, key_fields):
     """Input:
     line_gen: generator that produces lines with linefeeds chopped off
     key_fields: field numbers in each record used to determine chunk membership
     Output:
-    locus_chunk: list of consecutive lines that have the same key_fields
-    """
+    locus_chunk: list of consecutive lines that have the same key_fields"""
     locus_chunk = []
     last_key = ""
-    first_line = True
-    for line in line_gen:
-        fields = line.split()
-        key = subset_list_by_indices(key_fields, fields)
-        if key == last_key or first_line:
-            locus_chunk.append(line)
-            first_line = False
+    first_record = True
+    for record in record_gen:
+        key = [record[f] for f in key_fields]
+        if key == last_key or first_record:
+            locus_chunk.append(record)
+            first_record = False
         else:
             if locus_chunk != []:
                 yield locus_chunk
-            locus_chunk = [line]
+            locus_chunk = [record]
            last_key = key
    yield locus_chunk
 
 def chunk_stats(chunk):
     records = 0
+    conf_calls = 0
     correct_genotype = 0
     for record in chunk:
-        fields = record.split()
-        if fields[2] == fields[9]:
-            correct_genotype += 1
+        if abs(float(record["BtnbLod"])) >= 5:
+            conf_calls += 1
+            if record["HapmapChipGenotype"] == record["BestGenotype"]:
+                correct_genotype += 1
         records += 1
-    return float(correct_genotype) / records
+    return float(correct_genotype) / max(conf_calls,1), float(conf_calls) / max(records,1)
 
 if __name__ == "__main__":
     if len(sys.argv) < 2:
         sys.exit("Usage: CoverageEval.py geli_file")
     filename = sys.argv[1]
-    fin = open(filename)
-    locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
+    locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))
     print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
     for locus in locus_gen:
         #print "NEW LOCUS"
         covs = dict()
-        coverage_chunk_gen = chunk_generator(locus, (0,4,5))
+        coverage_chunk_gen = chunk_generator(locus, ("DownsampledCoverage", "Sequence", "Position"))
         for cov_chunk in coverage_chunk_gen:
             #print "NEW COVERAGE"
             #print "\n".join(cov_chunk)
-            fields = cov_chunk[0].split()
-            coverage = fields[1]
-            print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
-            #covs[coverage] = cov_chunk
+            record = cov_chunk[0]
+            print "\t".join(map(str,("%.4f\t%.4f"%chunk_stats(cov_chunk), record["DownsampledCoverage"], record["Sequence"]+":"+record["Position"],record["ReferenceBase"],record["HapmapChipGenotype"])))

FlatFileTable.py

@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+import sys, itertools
+
+def record_generator(filename, sep="\t"):
+    """Given a file with field headers on the first line and records on subsequent lines,
+    generates a dictionary for each line keyed by the header fields"""
+    fin = open(filename)
+    header = fin.readline().rstrip().split() # pull off header
+    for line in fin:
+        fields = line.rstrip().split(sep)
+        record = dict(itertools.izip(header, fields))
+        yield record
+
+def record_matches_values(record, match_field_values):
+    for match_field, match_values in match_field_values:
+        if record[match_field] not in match_values:
+            return False
+    return True
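A minimal usage sketch for the new module, assuming a tab-separated file with a header row; the filename below is only illustrative, while the field names are the ones ParseDCCSequenceData.py selects on:

import FlatFileTable

# Each record is a dict keyed by the header fields of the input file.
for record in FlatFileTable.record_generator("example.index"):  # hypothetical filename
    # Keep only Illumina or 454 records, mirroring how record_matches_values is called below.
    if FlatFileTable.record_matches_values(record, (("INSTRUMENT_PLATFORM", ("ILLUMINA", "LS454")),)):
        print record["STUDY_NAME"]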

ParseDCCSequenceData.py

@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+import operator, FlatFileTable
+
+class db_file:
+    filename = ""
+
+    def __init__(self, filenm):
+        self.filename = filenm
+
+    def count_fields(self, fixed_field, fixed_field_values, count_field):
+        record_gen = FlatFileTable.record_generator(self.filename)
+        counts = dict()
+        #fixed_field_num = self.field_names[fixed_field]
+        print count_field+" for "+fixed_field+" = "+" or ".join(fixed_field_values)
+        for record in record_gen:
+            if record[fixed_field] in fixed_field_values:
+                #fixed_field_value = fields[fixed_field_num]
+                count_field_num = record[count_field]
+                if counts.has_key(count_field_num):
+                    counts[count_field_num] += 1
+                else:
+                    counts[count_field_num] = 0
+        for k,v in sorted(counts.items(), key=operator.itemgetter(1), cmp=lambda x,y: y-x ):
+            print str(k)+"\t"+str(v)
+
+    def count_bases(self, fixed_field_values): #, fixed_field_values):
+        record_gen = FlatFileTable.record_generator(self.filename)
+        base_count = 0
+        #fixed_field_num = self.field_names[fixed_field]
+        #print "For "+fixed_field+" = "+" or ".join(fixed_field_values)+":",
+        print "For "+ " AND ".join( [one_ffv[0]+" = "+" OR ".join(one_ffv[1]) for one_ffv in fixed_field_values] )
+        for record in record_gen:
+            #if record[fixed_field] in fixed_field_values:
+            if FlatFileTable.record_matches_values(record, fixed_field_values):
+                try:
+                    base_count += int(record["BASE_COUNT"])
+                except ValueError:
+                    pass
+        print "%e bases" % base_count
+
+if __name__ == "__main__":
+    db = db_file("sequence.index")
+    platforms = (("ILLUMINA",), ("AB SOLiD","SOLID","ABI_SOLID","AB SOLiD System 2.0"), ("LS454",))
+    studies = (("1000Genomes Project Pilot 1",), ("1000Genomes Project Pilot 2",), ("1000Genomes Project Pilot 3",))
+
+    for select_field, select_field_values in (): #(("INSTRUMENT_PLATFORM", platforms), ("STUDY_NAME", studies)):
+        for count_field in ("CENTER_NAME", "STUDY_NAME", "INSTRUMENT_PLATFORM"):
+            for select_field_value in select_field_values:
+                db.count_fields(select_field, select_field_value, count_field)
+        for select_field_value in select_field_values:
+            db.count_bases(((select_field, select_field_value),))
+        print
+
+    for field1, value1 in zip(["INSTRUMENT_PLATFORM"]*len(platforms), platforms):
+        for field2, value2 in zip(["STUDY_NAME"]*len(studies), studies):
+            db.count_bases(((field1, value1), (field2, value2)))
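A rough driving sketch for the new script, assuming a local copy of the DCC sequence.index file (the path is illustrative):

import ParseDCCSequenceData

# Hypothetical driver: total BASE_COUNT for one platform/study combination,
# using the same field names and values as the __main__ block above.
db = ParseDCCSequenceData.db_file("sequence.index")
db.count_bases((("INSTRUMENT_PLATFORM", ("ILLUMINA",)),
                ("STUDY_NAME", ("1000Genomes Project Pilot 1",))))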