Added ParseDCCSequenceData.py, which enables analysis of the quantity of sequence data by platform and project; moved the table/record system into a new module, FlatFileTable.py, and adopted it in ParseDCCSequenceData.py and CoverageEval.py; changed the LOD threshold in CoverageEvalWalker.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1201 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-07-08 22:04:26 +00:00
parent 3f0304de5a
commit c8fcecbc6f
4 changed files with 116 additions and 32 deletions

View File

@ -88,7 +88,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
LocusContext subContext = new LocusContext(context.getLocation(), sub_reads, sub_offsets);
AlleleFrequencyEstimate alleleFreq = SSG.map(tracker, ref, subContext);
if (alleleFreq != null && (alleleFreq.lodVsRef >= LOD_THRESHOLD || alleleFreq.lodVsRef <= LOD_THRESHOLD)) {
if (alleleFreq != null) {
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+alleleFreq.callType()+"\t"+alleleFreq.asGeliString());
}
}

View File

@ -1,13 +1,6 @@
#!/usr/bin/env python
import sys
def chopped_line_generator(filename):
    """Yield each data line of *filename* with trailing whitespace removed.

    The first line of the file is treated as a header and skipped."""
    fin = open(filename)
    fin.readline()  # discard the header line
    for raw in fin:
        yield raw.rstrip()
import sys, itertools, FlatFileTable
def subset_list_by_indices(indices, list):
    """Return the elements of *list* at the given *indices*, in index order.

    NOTE(review): the parameter name shadows the builtin ``list``; kept
    for backward compatibility with existing callers."""
    subset = []
    for index in indices:
        subset.append(list[index])
    return subset
def chunk_generator(record_gen, key_fields):
    """Group consecutive records that share the same key.

    Input:
    record_gen: iterable of records (dicts keyed by field name)
    key_fields: field names whose values determine chunk membership
    Output:
    yields lists of consecutive records whose key_fields values are equal."""
    locus_chunk = []
    last_key = ""
    first_record = True
    for record in record_gen:
        key = [record[f] for f in key_fields]
        if key == last_key or first_record:
            locus_chunk.append(record)
            first_record = False
        else:
            if locus_chunk != []:
                yield locus_chunk
            locus_chunk = [record]
        # update on every iteration, so the chunk that starts the stream is
        # compared against its own key rather than the initial "" sentinel
        last_key = key
    # flush the final chunk; skip it when the input was empty
    if locus_chunk != []:
        yield locus_chunk
def chunk_stats(chunk):
    """Compute call statistics for one chunk of genotype records.

    Returns a pair:
      (fraction of confident calls whose BestGenotype matches the
       HapmapChipGenotype, fraction of records that were confident calls).
    A call is counted as confident when |BtnbLod| >= 5.  The max(..., 1)
    guards keep the divisions defined when a chunk has no confident calls
    or no records at all.
    NOTE(review): the genotype-match test is applied only to confident
    calls, consistent with dividing correct_genotype by conf_calls."""
    records = 0
    conf_calls = 0
    correct_genotype = 0
    for record in chunk:
        if abs(float(record["BtnbLod"])) >= 5:
            conf_calls += 1
            if record["HapmapChipGenotype"] == record["BestGenotype"]:
                correct_genotype += 1
        records += 1
    return float(correct_genotype) / max(conf_calls,1), float(conf_calls) / max(records,1)
if __name__ == "__main__":
if len(sys.argv) < 2:
sys.exit("Usage: CoverageEval.py geli_file")
filename = sys.argv[1]
fin = open(filename)
locus_gen = chunk_generator(chopped_line_generator(filename), (4,5))
locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))
print "Fraction correct genotype\tCoverage sampled\tLocus\tReference base\tHapmap chip genotype (Max. coverage genotype call for reference calls)"
for locus in locus_gen:
#print "NEW LOCUS"
covs = dict()
coverage_chunk_gen = chunk_generator(locus, (0,4,5))
coverage_chunk_gen = chunk_generator(locus, ("DownsampledCoverage", "Sequence", "Position"))
for cov_chunk in coverage_chunk_gen:
#print "NEW COVERAGE"
#print "\n".join(cov_chunk)
fields = cov_chunk[0].split()
coverage = fields[1]
print "\t".join(map(str,("%.2f"%chunk_stats(cov_chunk), coverage, fields[4]+":"+fields[5],fields[6],fields[2])))
#covs[coverage] = cov_chunk
record = cov_chunk[0]
print "\t".join(map(str,("%.4f\t%.4f"%chunk_stats(cov_chunk), record["DownsampledCoverage"], record["Sequence"]+":"+record["Position"],record["ReferenceBase"],record["HapmapChipGenotype"])))

View File

@ -0,0 +1,19 @@
#!/usr/bin/env python
import sys, itertools
def record_generator(filename, sep="\t"):
    """Given a file with field headers on the first line and records on
    subsequent lines, generate a dictionary for each line keyed by the
    header fields.

    sep: field separator passed to str.split(); None means "any run of
    whitespace" (Python's default split behavior)."""
    fin = open(filename)
    try:
        # split the header with the SAME separator as the data rows, so an
        # explicit sep (e.g. "\t") cannot desynchronize keys and values
        header = fin.readline().rstrip().split(sep)
        for line in fin:
            fields = line.rstrip().split(sep)
            # builtin zip: identical result to itertools.izip when consumed
            # by dict(), and portable to Python 3
            yield dict(zip(header, fields))
    finally:
        # close the handle when the generator is exhausted or discarded
        fin.close()
def record_matches_values(record, match_field_values):
    """Return True iff, for every (field, allowed_values) pair in
    match_field_values, record[field] is one of the allowed values."""
    return all(record[field] in values for field, values in match_field_values)

View File

@ -0,0 +1,75 @@
#!/usr/bin/env python
import operator, FlatFileTable
class db_file:
filename = ""
def __init__(self, filenm):
self.filename = filenm
def count_fields(self, fixed_field, fixed_field_values, count_field):
record_gen = FlatFileTable.record_generator(self.filename)
counts = dict()
#fixed_field_num = self.field_names[fixed_field]
print count_field+" for "+fixed_field+" = "+" or ".join(fixed_field_values)
for record in record_gen:
if record[fixed_field] in fixed_field_values:
#fixed_field_value = fields[fixed_field_num]
count_field_num = record[count_field]
if counts.has_key(count_field_num):
counts[count_field_num] += 1
else:
counts[count_field_num] = 0
for k,v in sorted(counts.items(), key=operator.itemgetter(1), cmp=lambda x,y: y-x ):
print str(k)+"\t"+str(v)
def count_bases(self, fixed_field_values): #, fixed_field_values):
record_gen = FlatFileTable.record_generator(self.filename)
base_count = 0
#fixed_field_num = self.field_names[fixed_field]
#print "For "+fixed_field+" = "+" or ".join(fixed_field_values)+":",
print "For "+ " AND ".join( [one_ffv[0]+" = "+" OR ".join(one_ffv[1]) for one_ffv in fixed_field_values] )
for record in record_gen:
#if record[fixed_field] in fixed_field_values:
if FlatFileTable.record_matches_values(record, fixed_field_values):
try:
base_count += int(record["BASE_COUNT"])
except ValueError:
pass
print "%e bases" % base_count
if __name__ == "__main__":
    db = db_file("sequence.index")
    # platform spellings vary across submitting centers; group the synonyms
    platforms = (("ILLUMINA",), ("AB SOLiD","SOLID","ABI_SOLID","AB SOLiD System 2.0"), ("LS454",))
    studies = (("1000Genomes Project Pilot 1",), ("1000Genomes Project Pilot 2",), ("1000Genomes Project Pilot 3",))
    # per-field breakdowns are currently disabled (empty tuple); restore the
    # commented tuple to re-enable them
    for select_field, select_field_values in (): #(("INSTRUMENT_PLATFORM", platforms), ("STUDY_NAME", studies)):
        for count_field in ("CENTER_NAME", "STUDY_NAME", "INSTRUMENT_PLATFORM"):
            for select_field_value in select_field_values:
                db.count_fields(select_field, select_field_value, count_field)
        for select_field_value in select_field_values:
            db.count_bases(((select_field, select_field_value),))
            print
    # base counts for every platform x study combination
    for field1, value1 in zip(["INSTRUMENT_PLATFORM"]*len(platforms), platforms):
        for field2, value2 in zip(["STUDY_NAME"]*len(studies), studies):
            db.count_bases(((field1, value1), (field2, value2)))