Added - PairedQualityScoreCountsWalker: counts quality scores (e.g. as a histogram) on first reads of a pair and second reads of a pair. Turns out there's a consistent difference in quality scores; even after recalibrating without the pair ordering as a covariate (there's a bit of averaging -- but not as much as I initially thought).

Added - A paired read order covariate to use with recalibration. Currently experimental: for instance, what's a proper pair versus just a pair? Nobody should use this one... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2401 348d0f76-0448-11de-a6fe-93d51630548a
2009-12-18 15:01:01 +00:00 · 2009-12-18 15:01:01 +00:00 · 7b5e332ff3
parent 4f59bfd513
commit 7b5e332ff3
2 changed files with 135 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PairedReadOrderCovariate.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PairedReadOrderCovariate.java
@ -0,0 +1,28 @@
+package org.broadinstitute.sting.gatk.walkers.recalibration;
+
+import net.sf.samtools.SAMRecord;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: chartl
+ * Date: Dec 16, 2009
+ * Time: 3:22:19 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class PairedReadOrderCovariate implements ExperimentalCovariate{
+
+    public void initialize (final RecalibrationArgumentCollection rac ) { /* do nothing */ }
+
+    public final Comparable getValue(final SAMRecord read, final int offset) {
+        return read.getReadPairedFlag() ? "Not_Paired" : read.getMateUnmappedFlag() ? "Mate_Unmapped" : read.getFirstOfPairFlag() ? "First_Read" : "Second_Read";
+    }
+
+    public final Comparable getValue( final String str ) {
+        return Integer.parseInt( str );
+    }
+
+    // Used to estimate the amount space required for the full data HashMap
+    public final int estimatedNumberOfBins() {
+        return 4;
+    }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/PairedQualityScoreCountsWalker.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/PairedQualityScoreCountsWalker.java
@ -0,0 +1,107 @@
+package org.broadinstitute.sting.oneoffprojects.walkers;
+
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.utils.Pair;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+import net.sf.samtools.SAMRecord;
+
+/**
+ * This walker prints out quality score counts for first and second reads of a pair aggregated over all reads
+ * in the interval.
+ *
+ * @Author: Chris Hartl
+ */
+public class PairedQualityScoreCountsWalker extends ReadWalker<Pair<byte[],Boolean>,Pair<CycleQualCounts,CycleQualCounts>> {
+    @Argument(fullName="readLength", shortName="rl", doc="Length of reads in the bam file", required=true)
+    public int readLength = -1;
+
+    public void initialize() { return; }
+
+    public Pair<CycleQualCounts,CycleQualCounts> reduceInit() {
+        return new Pair<CycleQualCounts,CycleQualCounts>( new CycleQualCounts(readLength), new CycleQualCounts(readLength) );
+    }
+
+    public Pair<CycleQualCounts,CycleQualCounts> reduce( Pair<byte[],Boolean> mapCounts, Pair<CycleQualCounts,CycleQualCounts> reduceCounts ) {
+        if ( mapCounts != null ) {
+            if ( mapCounts.second ) {
+                reduceCounts.first.update(mapCounts.first);
+            } else {
+                reduceCounts.second.update(mapCounts.first);
+            }
+        }
+
+        return reduceCounts;
+    }
+
+    public Pair<byte[],Boolean> map( char[] ref, SAMRecord read) {
+        if ( canUseRead(read) ) {
+            return getCorrectlyOrientedBaseQualities(read);
+        } else {
+            return null;
+        }
+    }
+
+    private boolean canUseRead(SAMRecord read) {
+        return ( ! read.getMateUnmappedFlag() && ! read.getReadUnmappedFlag() ) && ( read.getReadPairedFlag() && read.getReadLength() == readLength );
+    }
+
+    private Pair<byte[],Boolean> getCorrectlyOrientedBaseQualities(SAMRecord read) {
+        byte[] quals = read.getReadNegativeStrandFlag() ? BaseUtils.reverse(read.getBaseQualities()) : read.getBaseQualities();
+        return new Pair<byte[], Boolean>(quals, read.getFirstOfPairFlag());
+    }
+
+    public void onTraversalDone(Pair<CycleQualCounts,CycleQualCounts> finalCounts) {
+        StringBuilder output = new StringBuilder();
+        output.append(String.format("%s\t%s\t%s%n","Cycle","First_read_counts","Second_read_counts"));
+        for ( int offset = 0; offset < readLength; offset++ ) {
+            output.append(String.format("%d\t%s\t%s%n",offset,finalCounts.first.getCountDistribution(offset),finalCounts.second.getCountDistribution(offset)));
+        }
+        out.printf("%s",output.toString());
+    }
+
+}
+
+class CycleQualCounts {
+    private long[][] qualityCountsByCycle;
+    private int cycleLength;
+    private int qualMax = QualityUtils.MAX_REASONABLE_Q_SCORE + 1;
+
+    public CycleQualCounts(int cycleLength) {
+        this.cycleLength = cycleLength;
+        qualityCountsByCycle = new long[cycleLength][qualMax];
+        for ( int cycle = 0; cycle < cycleLength; cycle++ ) {
+            for ( int qual = 0; qual < qualMax; qual++) {
+                qualityCountsByCycle[cycle][qual] = 0;
+            }
+        }
+    }
+
+    public void update(int offset, byte quality) {
+        qualityCountsByCycle[offset][qualityToQualityIndex(quality)]++;
+    }
+
+
+    public void update(byte[] qualArray) {
+        for ( int o = 0; o < cycleLength; o++ ) {
+            update(o,qualArray[o]);
+        }
+    }
+
+    private int qualityToQualityIndex(byte qual) {
+        return qual < 0 ? 0 : qual > qualMax ? qualMax : qual;
+    }
+
+    public long[][] getCounts() { return qualityCountsByCycle; }
+
+    public String getCountDistribution(int offset) {
+        StringBuilder b = new StringBuilder();
+        for ( int qual = 0; qual < qualMax-1; qual++ ) {
+            b.append(String.format("%d;",qualityCountsByCycle[offset][qual]));    
+        }
+        b.append(String.format("%d",qualityCountsByCycle[offset][qualMax-1]));
+
+        return b.toString();
+    }
+}