Added ParseDCCSequenceData.py to the repository and made changes that allow analysis of the quantity of sequence data by platform and project; moved the table/record system into a new module, FlatFileTable.py, and built it into ParseDCCSequenceData.py and CoverageEval.py; changed the LOD threshold in CoverageEvalWalker.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1201 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3f0304de5a
commit
c8fcecbc6f
|
|
@ -88,7 +88,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
|
|||
LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
|
||||
AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
|
||||
|
||||
if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
|
||||
if (alleleFreq != null) {
|
||||
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,13 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
|
||||
def chopped_line_generator(filename):
    """Yield each data line of *filename* with trailing whitespace removed.

    The first line of the file is assumed to be a header and is skipped.
    """
    handle = open(filename)
    handle.readline()  # discard the header line
    for raw in handle:
        yield raw.rstrip()
|
||||
import sys, itertools, FlatFileTable
|
||||
|
||||
def subset_list_by_indices(indices, list):
    """Return the elements of *list* found at the positions given in *indices*.

    indices: iterable of integer positions.
    list: sequence to pick from.  NOTE(review): the parameter name shadows the
          ``list`` builtin; kept unchanged for interface compatibility.

    NOTE(review): the loop line fell into hidden diff context in the source
    this was recovered from; reconstructed as the obvious iteration.
    """
    subset = []
    for index in indices:
        subset.append(list[index])
    return subset
|
||||
|
||||
def chunk_generator(record_gen, key_fields):
    """Input:
    record_gen: iterable of dict records (see FlatFileTable.record_generator)
    key_fields: field names whose values determine chunk membership
    Output (yielded):
    locus_chunk: list of consecutive records that share the same key_fields

    The source span interleaved the pre- and post-refactor versions of this
    function (diff residue); this is the post-refactor, record-based version.
    """
    locus_chunk = []
    last_key = ""
    first_record = True
    for record in record_gen:
        key = [record[f] for f in key_fields]
        if key == last_key or first_record:
            locus_chunk.append(record)
            first_record = False
        else:
            if locus_chunk != []:
                yield locus_chunk
            locus_chunk = [record]
        last_key = key
    # Emit the trailing chunk (an empty list when record_gen was empty).
    yield locus_chunk
|
||||
|
||||
def chunk_stats(chunk):
    """Return (fraction of confident calls with the correct genotype,
    fraction of records that are confident calls) for a chunk of records.

    A call is "confident" when abs(BtnbLod) >= 5.  Both denominators are
    clamped to at least 1 so empty or all-unconfident chunks return 0.0
    instead of raising ZeroDivisionError.

    NOTE(review): indentation was lost in the diff this was recovered from;
    the correct-genotype comparison is assumed to apply only to confident
    calls (matching the conf_calls denominator) -- confirm against history.
    """
    records = 0
    conf_calls = 0
    correct_genotype = 0
    for record in chunk:
        if abs(float(record["BtnbLod"])) >= 5:
            conf_calls += 1
            if record["HapmapChipGenotype"] == record["BestGenotype"]:
                correct_genotype += 1
        records += 1
    return float(correct_genotype) / max(conf_calls, 1), float(conf_calls) / max(records, 1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
sys.exit("Usage: CoverageEval.py geli_file")
|
||||
filename = sys.argv[1]
|
||||
|
||||
fin = open(filename)
|
||||
locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
|
||||
locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))
|
||||
print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
|
||||
for locus in locus_gen:
|
||||
#print "NEW LOCUS"
|
||||
covs = dict()
|
||||
coverage_chunk_gen = chunk_generator(locus, (0,4,5))
|
||||
coverage_chunk_gen = chunk_generator(locus, ("DownsampledCoverage", "Sequence", "Position"))
|
||||
for cov_chunk in coverage_chunk_gen:
|
||||
#print "NEW COVERAGE"
|
||||
#print "\n".join(cov_chunk)
|
||||
fields = cov_chunk[0].split()
|
||||
coverage = fields[1]
|
||||
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
|
||||
|
||||
#covs[coverage] = cov_chunk
|
||||
record = cov_chunk[0]
|
||||
print "\t".join(map(str,("%.4f\t%.4f"%chunk_stats(cov_chunk), record["DownsampledCoverage"], record["Sequence"]+":"+record["Position"],record["ReferenceBase"],record["HapmapChipGenotype"])))
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys, itertools
|
||||
|
||||
def record_generator(filename, sep="\t"):
    """Given a file with field headers on the first line and records on
    subsequent lines, generate a dictionary for each line keyed by the
    header fields.

    filename: path to the flat table file.
    sep: separator passed to str.split for data lines (default tab); pass
         None to split on arbitrary whitespace.

    NOTE(review): the header line is always split on whitespace regardless
    of *sep* -- confirm that headers never contain internal spaces.
    """
    fin = open(filename)
    try:
        header = fin.readline().rstrip().split()  # pull off header
        for line in fin:
            fields = line.rstrip().split(sep)
            # zip (not itertools.izip) gives identical results through
            # dict() and is portable across Python versions.
            yield dict(zip(header, fields))
    finally:
        # Close the handle even if the consumer abandons the generator
        # early (GeneratorExit lands here); the original leaked it.
        fin.close()
|
||||
|
||||
def record_matches_values(record, match_field_values):
    """Return True iff the record satisfies every field constraint.

    match_field_values: iterable of (field_name, allowed_values) pairs; the
    record matches when record[field_name] is among allowed_values for every
    pair.  An empty constraint list matches any record.
    """
    return all(record[field] in allowed
               for field, allowed in match_field_values)
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import operator, FlatFileTable
|
||||
|
||||
class db_file:
|
||||
filename = ""
|
||||
|
||||
def __init__(self, filenm):
|
||||
self.filename = filenm
|
||||
|
||||
def count_fields(self, fixed_field, fixed_field_values, count_field):
|
||||
record_gen = FlatFileTable.record_generator(self.filename)
|
||||
|
||||
counts = dict()
|
||||
#fixed_field_num = self.field_names[fixed_field]
|
||||
print count_field+" for "+fixed_field+" = "+" or ".join(fixed_field_values)
|
||||
|
||||
for record in record_gen:
|
||||
if record[fixed_field] in fixed_field_values:
|
||||
#fixed_field_value = fields[fixed_field_num]
|
||||
count_field_num = record[count_field]
|
||||
if counts.has_key(count_field_num):
|
||||
counts[count_field_num] += 1
|
||||
else:
|
||||
counts[count_field_num] = 0
|
||||
|
||||
for k,v in sorted(counts.items(), key=operator.itemgetter(1), cmp=lambda x,y: y-x ):
|
||||
print str(k)+"\t"+str(v)
|
||||
|
||||
def count_bases(self, fixed_field_values): #, fixed_field_values):
|
||||
record_gen = FlatFileTable.record_generator(self.filename)
|
||||
|
||||
base_count = 0
|
||||
#fixed_field_num = self.field_names[fixed_field]
|
||||
#print "For "+fixed_field+" = "+" or ".join(fixed_field_values)+":",
|
||||
print "For "+ " AND ".join( [one_ffv[0]+" = "+" OR ".join(one_ffv[1]) for one_ffv in fixed_field_values] )
|
||||
|
||||
for record in record_gen:
|
||||
#if record[fixed_field] in fixed_field_values:
|
||||
if FlatFileTable.record_matches_values(record, fixed_field_values):
|
||||
try:
|
||||
base_count += int(record["BASE_COUNT"])
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
print "%e bases" % base_count
|
||||
|
||||
if __name__ == "__main__":
    db = db_file("sequence.index")

    # Value groups for the breakdowns below; each inner tuple lists the
    # spellings that count as the same platform/study.
    platforms = (("ILLUMINA",), ("AB SOLiD","SOLID","ABI_SOLID","AB SOLiD System 2.0"), ("LS454",))
    studies = (("1000Genomes Project Pilot 1",), ("1000Genomes Project Pilot 2",), ("1000Genomes Project Pilot 3",))

    # Per-field frequency breakdowns are currently disabled (empty tuple);
    # restore the commented pairs to re-enable them.
    for select_field, select_field_values in (): #(("INSTRUMENT_PLATFORM", platforms), ("STUDY_NAME", studies)):
        for count_field in ("CENTER_NAME", "STUDY_NAME", "INSTRUMENT_PLATFORM"):
            for select_field_value in select_field_values:
                db.count_fields(select_field, select_field_value, count_field)

        for select_field_value in select_field_values:
            db.count_bases(((select_field, select_field_value),))

        print

    # Total base counts for every platform x study combination.
    for platform_values in platforms:
        for study_values in studies:
            db.count_bases((("INSTRUMENT_PLATFORM", platform_values), ("STUDY_NAME", study_values)))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue