From c8fcecbc6fd5244bf606c7c0e469f3ad590036b5 Mon Sep 17 00:00:00 2001 From: andrewk Date: Wed, 8 Jul 2009 22:04:26 +0000 Subject: [PATCH] Added ParseDCCSequenceData.py to repository and made changes that allow an analysis of quantity of sequence data by platform and project, moved table / record system to a new module called FlatFileTable.py and built that into ParseDCCSequenceData and CoverageEval.py; changed lod threshold in CoverageEvalWalker. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1201 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/CoverageEvalWalker.java | 2 +- python/CoverageEval.py | 52 ++++++------- python/FlatFileTable.py | 19 +++++ python/ParseDCCSequenceData.py | 75 +++++++++++++++++++ 4 files changed, 116 insertions(+), 32 deletions(-) create mode 100644 python/FlatFileTable.py create mode 100755 python/ParseDCCSequenceData.py diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java index 1599c815b..92f0b4685 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java @@ -88,7 +88,7 @@ public class CoverageEvalWalker extends LocusWalker, String> { LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets); AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext); - if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) { + if (alleleFreq != null) { GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString()); } } diff --git a/python/CoverageEval.py b/python/CoverageEval.py index 34ba63c22..9b18f688c 100755 --- a/python/CoverageEval.py +++ b/python/CoverageEval.py @@ -1,13 +1,6 @@ #!/usr/bin/env python -import sys - -def chopped_line_generator(filename): - fin = open(filename) - fin.readline() # pull off header - for line in fin: - line = line.rstrip() - yield line +import sys, itertools, FlatFileTable def subset_list_by_indices(indices, list): subset = [] @@ -15,59 +8,56 @@ def subset_list_by_indices(indices, list): subset.append(list[index]) return subset -def chunk_generator(line_gen, key_fields): +def chunk_generator(record_gen, key_fields): """Input: line_gen: generator that produces lines with linefeeds chopped off key_fields: field numbers in each record used to determine chunk membership Output: - locus_chunk: list of consecutive lines that have the same key_fields -""" + locus_chunk: list of consecutive lines that have the same key_fields""" locus_chunk = [] last_key = "" - first_line = True - for line in line_gen: - fields = line.split() - key = subset_list_by_indices(key_fields, fields) - if key == last_key or first_line: - locus_chunk.append(line) - first_line = False + first_record = True + for record in record_gen: + key = [record[f] for f in key_fields] + if key == last_key or first_record: + locus_chunk.append(record) + first_record = False else: if locus_chunk != []: yield locus_chunk - locus_chunk = [line] + locus_chunk = [record] last_key = key yield locus_chunk def chunk_stats(chunk): records = 0 + conf_calls = 0 correct_genotype = 0 for record in chunk: - fields = record.split() - if fields[2] == fields[9]: - correct_genotype += 1 + if abs(float(record["BtnbLod"])) >= 5: + conf_calls += 1 + if record["HapmapChipGenotype"] == record["BestGenotype"]: + correct_genotype += 1 records += 1 - return float(correct_genotype) / records + + return float(correct_genotype) / max(conf_calls,1), float(conf_calls) / max(records,1) if __name__ == "__main__": if len(sys.argv) < 2: sys.exit("Usage: CoverageEval.py geli_file") filename = sys.argv[1] - fin = open(filename) - locus_gen = chunk_generator(chopped_line_generator(filename), (4,5)) + locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position")) print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)" for locus in locus_gen: #print "NEW LOCUS" covs = dict() - coverage_chunk_gen = chunk_generator(locus, (0,4,5)) + coverage_chunk_gen = chunk_generator(locus, ("DownsampledCoverage", "Sequence", "Position")) for cov_chunk in coverage_chunk_gen: #print "NEW COVERAGE" #print "\n".join(cov_chunk) - fields = cov_chunk[0].split() - coverage = fields[1] - print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2]))) - - #covs[coverage] = cov_chunk + record = cov_chunk[0] + print "\t".join(map(str,("%.4f\t%.4f"%chunk_stats(cov_chunk), record["DownsampledCoverage"], record["Sequence"]+":"+record["Position"],record["ReferenceBase"],record["HapmapChipGenotype"]))) diff --git a/python/FlatFileTable.py b/python/FlatFileTable.py new file mode 100644 index 000000000..fe75344c1 --- /dev/null +++ b/python/FlatFileTable.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +import sys, itertools + +def record_generator(filename, sep="\t"): + """Given a file with field headers on the first line and records on subsequent lines, +generates a dictionary for each line keyed by the header fields""" + fin = open(filename) + header = fin.readline().rstrip().split() # pull off header + for line in fin: + fields = line.rstrip().split(sep) + record = dict(itertools.izip(header, fields)) + yield record + +def record_matches_values(record, match_field_values): + for match_field, match_values in match_field_values: + if record[match_field] not in match_values: + return False + return True diff --git a/python/ParseDCCSequenceData.py b/python/ParseDCCSequenceData.py new file mode 100755 index 000000000..074da5c01 --- /dev/null +++ b/python/ParseDCCSequenceData.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +import operator, FlatFileTable + +class db_file: + filename = "" + + def __init__(self, filenm): + self.filename = filenm + + def count_fields(self, fixed_field, fixed_field_values, count_field): + record_gen = FlatFileTable.record_generator(self.filename) + + counts = dict() + #fixed_field_num = self.field_names[fixed_field] + print count_field+" for "+fixed_field+" = "+" or ".join(fixed_field_values) + + for record in record_gen: + if record[fixed_field] in fixed_field_values: + #fixed_field_value = fields[fixed_field_num] + count_field_num = record[count_field] + if counts.has_key(count_field_num): + counts[count_field_num] += 1 + else: + counts[count_field_num] = 0 + + for k,v in sorted(counts.items(), key=operator.itemgetter(1), cmp=lambda x,y: y-x ): + print str(k)+"\t"+str(v) + + def count_bases(self, fixed_field_values): #, fixed_field_values): + record_gen = FlatFileTable.record_generator(self.filename) + + base_count = 0 + #fixed_field_num = self.field_names[fixed_field] + #print "For "+fixed_field+" = "+" or ".join(fixed_field_values)+":", + print "For "+ " AND ".join( [one_ffv[0]+" = "+" OR ".join(one_ffv[1]) for one_ffv in fixed_field_values] ) + + for record in record_gen: + #if record[fixed_field] in fixed_field_values: + if FlatFileTable.record_matches_values(record, fixed_field_values): + try: + base_count += int(record["BASE_COUNT"]) + except ValueError: + pass + + print "%e bases" % base_count + +if __name__ == "__main__": + db = db_file("sequence.index") + + platforms = (("ILLUMINA",), ("AB SOLiD","SOLID","ABI_SOLID","AB SOLiD System 2.0"), ("LS454",)) + studies = (("1000Genomes Project Pilot 1",), ("1000Genomes Project Pilot 2",), ("1000Genomes Project Pilot 3",)) + + for select_field, select_field_values in (): #(("INSTRUMENT_PLATFORM", platforms), ("STUDY_NAME", studies)): + for count_field in ("CENTER_NAME", "STUDY_NAME", "INSTRUMENT_PLATFORM"): + for select_field_value in select_field_values: + db.count_fields(select_field, select_field_value, count_field) + + for select_field_value in select_field_values: + db.count_bases(((select_field, select_field_value),)) + + print + + for field1, value1 in zip(["INSTRUMENT_PLATFORM"]*len(platforms), platforms): + for field2, value2 in zip(["STUDY_NAME"]*len(studies), studies): + db.count_bases(((field1, value1), (field2, value2))) + + + + + + + + +