Computes the read error rate for a bam file. Ignores reads with indels, treats low-quality and high-quality reference bases the same. Does not count ambiguous reference bases as mismatches. Optionally allows for best two bases in read to be used.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@330 348d0f76-0448-11de-a6fe-93d51630548a
2009-04-08 15:56:10 +00:00 · 2009-04-08 15:56:10 +00:00 · 25474ebe7e
parent 59b2e6a90f
commit 25474ebe7e
1 changed files with 104 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadErrorRateWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadErrorRateWalker.java
@ -0,0 +1,104 @@
+package org.broadinstitute.sting.playground.gatk.walkers;
+
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+import org.broadinstitute.sting.utils.QualityUtils;
+import net.sf.samtools.SAMRecord;
+import edu.mit.broad.picard.reference.ReferenceSequence;
+
+/**
+ * ReadErrorRateWalker assesses the error rate per read position ('cycle') by comparing
+ */
+public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
+    @Argument(fullName="useNextBestBase",required=false,defaultValue="false")
+    public boolean useNextBestBase;
+
+    public boolean filter(LocusContext context, SAMRecord read) {
+        return (read.getCigar().numCigarElements() == 1);
+    }
+
+    public boolean[] map(LocusContext context, SAMRecord read) {
+        boolean[] errorsPerCycle = new boolean[read.getReadLength() + 1];
+
+        byte[] bases  = read.getReadBases();
+        byte[] contig = context.getReferenceContig().getBases();
+        byte[] sq     = (byte[]) read.getAttribute("SQ");
+
+        int totalMismatches = 0;
+
+        for (int cycle = 0, offset = (int) context.getPosition(); cycle < bases.length; cycle++, offset++) {
+            byte compBase;
+
+            switch (contig[offset]) {
+                case 'A':
+                case 'a': compBase = 'A'; break;
+                case 'C':
+                case 'c': compBase = 'C'; break;
+                case 'G':
+                case 'g': compBase = 'G'; break;
+                case 'T':
+                case 't': compBase = 'T'; break;
+                default:  compBase = '.'; break;
+            }
+
+            if (compBase != '.') {
+                if (useNextBestBase) {
+                    int nextBestBaseIndex = QualityUtils.compressedQualityToBaseIndex(sq[cycle]);
+                    byte nextBestBase;
+                    switch (nextBestBaseIndex) {
+                        case 0: nextBestBase = 'A'; break;
+                        case 1: nextBestBase = 'C'; break;
+                        case 2: nextBestBase = 'G'; break;
+                        case 3: nextBestBase = 'T'; break;
+                        default: nextBestBase = '.'; break;
+                    }
+
+                    if (nextBestBase != '.') {
+                        errorsPerCycle[cycle] = !(bases[cycle] == compBase || bases[cycle] == nextBestBase);
+                        totalMismatches = !(bases[cycle] == compBase || bases[cycle] == nextBestBase) ? 1 : 0;
+                    }
+                } else {
+                    errorsPerCycle[cycle] = (bases[cycle] != compBase);
+                    totalMismatches += (bases[cycle] != compBase) ? 1 : 0;
+                }
+            }
+        }
+
+        if (totalMismatches > 4) {
+            for (int cycle = 0; cycle < bases.length; cycle++) { System.out.print((char) bases[cycle]); } System.out.print("\n");
+            for (int cycle = 0, offset = (int) context.getPosition(); cycle < bases.length; cycle++, offset++) { System.out.print((char) contig[offset]); } System.out.print("\n");
+            System.out.println(totalMismatches + "\n");
+        }
+
+        // We encode that we saw a read in the last position of the array.
+        // That way we know what to normalize by, and we get thread safety!
+        errorsPerCycle[errorsPerCycle.length - 1] = true;
+
+        return errorsPerCycle;
+    }
+
+    public int[] reduceInit() {
+        return null;
+    }
+
+    public int[] reduce(boolean[] value, int[] sum) {
+        if (sum == null) {
+            sum = new int[value.length];
+        }
+
+        for (int cycle = 0; cycle < value.length; cycle++) {
+            sum[cycle] += (value[cycle] ? 1 : 0);
+        }
+
+        return sum;
+    }
+
+    public void onTraversalDone(int[] sum) {
+        for (int cycle = 0; cycle < sum.length - 1; cycle++) {
+            double errorrate = ((double) sum[cycle])/((double) sum[sum.length - 1]);
+            System.out.println("[ERROR_RATE] " + cycle + " " + errorrate);
+        }
+    }
+
+}