Added ParseDCCSequenceData.py to the repository to allow analysis of the quantity of sequence data by platform and project; moved the table/record system into a new module, FlatFileTable.py, and built it into ParseDCCSequenceData.py and CoverageEval.py; changed the LOD threshold check in CoverageEvalWalker.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1201 348d0f76-0448-11de-a6fe-93d51630548a
andrewk 2009-07-08 22:04:26 +00:00
parent 3f0304de5a
commit c8fcecbc6f
4 changed files with 116 additions and 32 deletions

CoverageEvalWalker.java

@@ -88,7 +88,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
         LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
         AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
-        if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
+        if (alleleFreq != null) {
             GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
         }
     }

CoverageEval.py

@@ -1,13 +1,6 @@
 #!/usr/bin/env python
-import sys
-
-def chopped_line_generator(filename):
-    fin = open(filename)
-    fin.readline() # pull off header
-    for line in fin:
-        line = line.rstrip()
-        yield line
+import sys, itertools, FlatFileTable
 
 def subset_list_by_indices(indices, list):
     subset = []
@@ -15,59 +8,56 @@ def subset_list_by_indices(indices, list):
         subset.append(list[index])
     return subset
 
-def chunk_generator(line_gen, key_fields):
+def chunk_generator(record_gen, key_fields):
     """Input:
     line_gen: generator that produces lines with linefeeds chopped off
     key_fields: field numbers in each record used to determine chunk membership
     Output:
-    locus_chunk: list of consecutive lines that have the same key_fields
-    """
+    locus_chunk: list of consecutive lines that have the same key_fields"""
     locus_chunk = []
     last_key = ""
-    first_line = True
-    for line in line_gen:
-        fields = line.split()
-        key = subset_list_by_indices(key_fields, fields)
-        if key == last_key or first_line:
-            locus_chunk.append(line)
-            first_line = False
+    first_record = True
+    for record in record_gen:
+        key = [record[f] for f in key_fields]
+        if key == last_key or first_record:
+            locus_chunk.append(record)
+            first_record = False
         else:
             if locus_chunk != []:
                 yield locus_chunk
-            locus_chunk = [line]
+            locus_chunk = [record]
            last_key = key
    yield locus_chunk
 
 def chunk_stats(chunk):
     records = 0
+    conf_calls = 0
     correct_genotype = 0
     for record in chunk:
-        fields = record.split()
-        if fields[2] == fields[9]:
-            correct_genotype += 1
+        if abs(float(record["BtnbLod"])) >= 5:
+            conf_calls += 1
+            if record["HapmapChipGenotype"] == record["BestGenotype"]:
+                correct_genotype += 1
         records += 1
-    return float(correct_genotype) / records
+    return float(correct_genotype) / max(conf_calls,1), float(conf_calls) / max(records,1)
 
 if __name__ == "__main__":
     if len(sys.argv) < 2:
         sys.exit("Usage: CoverageEval.py geli_file")
     filename = sys.argv[1]
-    fin = open(filename)
-    locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
+    locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))
     print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
     for locus in locus_gen:
         #print "NEW LOCUS"
         covs = dict()
-        coverage_chunk_gen = chunk_generator(locus, (0,4,5))
+        coverage_chunk_gen = chunk_generator(locus, ("DownsampledCoverage", "Sequence", "Position"))
         for cov_chunk in coverage_chunk_gen:
             #print "NEW COVERAGE"
             #print "\n".join(cov_chunk)
-            fields = cov_chunk[0].split()
-            coverage = fields[1]
-            print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
-            #covs[coverage] = cov_chunk
+            record = cov_chunk[0]
+            print "\t".join(map(str,("%.4f\t%.4f"%chunk_stats(cov_chunk), record["DownsampledCoverage"], record["Sequence"]+":"+record["Position"],record["ReferenceBase"],record["HapmapChipGenotype"])))

FlatFileTable.py

@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+import sys, itertools
+
+def record_generator(filename, sep="\t"):
+    """Given a file with field headers on the first line and records on subsequent lines,
+    generates a dictionary for each line keyed by the header fields"""
+    fin = open(filename)
+    header = fin.readline().rstrip().split() # pull off header
+    for line in fin:
+        fields = line.rstrip().split(sep)
+        record = dict(itertools.izip(header, fields))
+        yield record
+
+def record_matches_values(record, match_field_values):
+    for match_field, match_values in match_field_values:
+        if record[match_field] not in match_values:
+            return False
+    return True
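A minimal usage sketch for the new module, assuming a tab-separated file with a header row; the filename below is only illustrative, while the field names are the ones ParseDCCSequenceData.py selects on:

import FlatFileTable

# Each record is a dict keyed by the header fields of the input file.
for record in FlatFileTable.record_generator("example.index"):  # hypothetical filename
    # Keep only Illumina or 454 records, mirroring how record_matches_values is called below.
    if FlatFileTable.record_matches_values(record, (("INSTRUMENT_PLATFORM", ("ILLUMINA", "LS454")),)):
        print record["STUDY_NAME"]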

ParseDCCSequenceData.py

@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+import operator, FlatFileTable
+
+class db_file:
+    filename = ""
+
+    def __init__(self, filenm):
+        self.filename = filenm
+
+    def count_fields(self, fixed_field, fixed_field_values, count_field):
+        record_gen = FlatFileTable.record_generator(self.filename)
+        counts = dict()
+        #fixed_field_num = self.field_names[fixed_field]
+        print count_field+" for "+fixed_field+" = "+" or ".join(fixed_field_values)
+        for record in record_gen:
+            if record[fixed_field] in fixed_field_values:
+                #fixed_field_value = fields[fixed_field_num]
+                count_field_num = record[count_field]
+                if counts.has_key(count_field_num):
+                    counts[count_field_num] += 1
+                else:
+                    counts[count_field_num] = 0
+        for k,v in sorted(counts.items(), key=operator.itemgetter(1), cmp=lambda x,y: y-x ):
+            print str(k)+"\t"+str(v)
+
+    def count_bases(self, fixed_field_values): #, fixed_field_values):
+        record_gen = FlatFileTable.record_generator(self.filename)
+        base_count = 0
+        #fixed_field_num = self.field_names[fixed_field]
+        #print "For "+fixed_field+" = "+" or ".join(fixed_field_values)+":",
+        print "For "+ " AND ".join( [one_ffv[0]+" = "+" OR ".join(one_ffv[1]) for one_ffv in fixed_field_values] )
+        for record in record_gen:
+            #if record[fixed_field] in fixed_field_values:
+            if FlatFileTable.record_matches_values(record, fixed_field_values):
+                try:
+                    base_count += int(record["BASE_COUNT"])
+                except ValueError:
+                    pass
+        print "%e bases" % base_count
+
+if __name__ == "__main__":
+    db = db_file("sequence.index")
+    platforms = (("ILLUMINA",), ("AB SOLiD","SOLID","ABI_SOLID","AB SOLiD System 2.0"), ("LS454",))
+    studies = (("1000Genomes Project Pilot 1",), ("1000Genomes Project Pilot 2",), ("1000Genomes Project Pilot 3",))
+
+    for select_field, select_field_values in (): #(("INSTRUMENT_PLATFORM", platforms), ("STUDY_NAME", studies)):
+        for count_field in ("CENTER_NAME", "STUDY_NAME", "INSTRUMENT_PLATFORM"):
+            for select_field_value in select_field_values:
+                db.count_fields(select_field, select_field_value, count_field)
+        for select_field_value in select_field_values:
+            db.count_bases(((select_field, select_field_value),))
+        print
+
+    for field1, value1 in zip(["INSTRUMENT_PLATFORM"]*len(platforms), platforms):
+        for field2, value2 in zip(["STUDY_NAME"]*len(studies), studies):
+            db.count_bases(((field1, value1), (field2, value2)))
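A rough driving sketch for the new script, assuming a local copy of the DCC sequence.index file (the path is illustrative):

import ParseDCCSequenceData

# Hypothetical driver: total BASE_COUNT for one platform/study combination,
# using the same field names and values as the __main__ block above.
db = ParseDCCSequenceData.db_file("sequence.index")
db.count_bases((("INSTRUMENT_PLATFORM", ("ILLUMINA",)),
                ("STUDY_NAME", ("1000Genomes Project Pilot 1",))))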