More improvements to the duplicate quality combiner, making progress towards a clean system

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@788 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-05-21 22:26:57 +00:00
parent 04e51c8d1d
commit 30c63daf89
5 changed files with 224 additions and 38 deletions

View File

@ -176,12 +176,12 @@ public class TraverseDuplicates extends TraversalEngine {
// Send each unique read to the map function
for ( SAMRecord unique : uniqueReads ) {
List<SAMRecord> l = Arrays.asList(unique);
sum = mapOne(dupWalker, l, site, refBases, locus, sum);
sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
}
}
if ( duplicateReads.size() > 0 )
sum = mapOne(dupWalker, duplicateReads, site, refBases, locus, sum);
sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
printProgress("dups", site);
@ -231,14 +231,15 @@ public class TraverseDuplicates extends TraversalEngine {
}
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
List<SAMRecord> readSet,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads,
GenomeLoc site,
byte[] refBases,
LocusContext locus,
T sum) {
final boolean keepMeP = dupWalker.filter(site, refBases, locus, readSet);
final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads);
if (keepMeP) {
M x = dupWalker.map(site, refBases, locus, readSet);
M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads);
sum = dupWalker.reduce(x, sum);
}
return sum;

View File

@ -17,20 +17,23 @@ import net.sf.samtools.SAMRecord;
@Requires({DataSource.READS,DataSource.REFERENCE})
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
return true; // We are keeping all the reads
}
/**
* Called by the traversal engine to decide whether to send non-duplicates as lists of
* singleton reads to the map function. By default it's false.
*
*
* @return true if you want to see non duplicates during the traversal
*/
public boolean mapUniqueReadsTooP() { return false; }
// Map over the org.broadinstitute.sting.gatk.LocusContext
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads);
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads);
// Given result of map function
public abstract ReduceType reduceInit();

View File

@ -0,0 +1,133 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import net.sf.samtools.*;
import edu.mit.broad.picard.reference.ReferenceSequence;
import java.util.ArrayList;
import java.util.Random;
import java.util.HashMap;
import java.io.File;
/**
 * ReplaceQuals replaces the base qualities of each read in the traversal with the
 * qualities of the same-named read (matched on read name and first/second-of-pair
 * status) loaded from a second BAM file given by inputQualsBAM.  Reads with no
 * match in the quals BAM pass through unchanged.  Output goes to outputBAM when
 * provided, otherwise reads are printed to standard out.
 *
 * NOTE(review): the previous class comment described ReadErrorRateWalker (a
 * copy-paste leftover); it has been rewritten to match this walker's behavior.
 *
 * @author Kiran Garimella
 */
public class ReplaceQuals extends ReadWalker<SAMRecord, SAMFileWriter> {
    @Argument(shortName="inputQualsBAM",doc="BAM files containing qualities to be replaced",required=true)
    public String inputQualsBAM;

    @Argument(shortName="outputBAM", required=false, doc="output BAM file for reads with replaced quals")
    public String outputFilename = null;

    // Maximum number of reads to load from inputQualsBAM; -1 means unlimited.
    public int MAX_READS_TO_LOAD = -1;

    // read name -> (first-of-pair read, second-of-pair read) from the quals BAM
    private HashMap<String, Pair<SAMRecord, SAMRecord>> readNameToPairs;

    // progress-logging interval while loading the quals BAM
    private int READ_PRINT_MOD = 100000;
    private final boolean DEBUG = false;

    /**
     * Loads up to MAX_READS_TO_LOAD reads from inputQualsBAM into readNameToPairs,
     * keyed by read name, with first and second reads of a pair kept separately.
     */
    public void initialize() {
        readNameToPairs = new HashMap<String, Pair<SAMRecord, SAMRecord>>();

        SAMFileReader samReader = new SAMFileReader(new File(inputQualsBAM));
        samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);

        int nReads = 0;
        logger.info("Starting to read inputQualsBAM = " + inputQualsBAM);
        for ( SAMRecord read : samReader ) {
            final String name = read.getReadName();
            Pair<SAMRecord, SAMRecord> binding = readNameToPairs.containsKey(name) ? readNameToPairs.get(name) : new Pair<SAMRecord, SAMRecord>(null, null);

            // Treat unpaired reads as first-of-pair; calling getFirstOfPairFlag()
            // on an unpaired read would throw an IllegalStateException.
            if ( isFirstOfPair(read) ) {
                binding.first = read;
            } else {
                binding.second = read;
            }
            readNameToPairs.put(name, binding);

            if ( ++nReads % READ_PRINT_MOD == 0 ) {
                logger.info(String.format(" Read %d reads so far...", nReads));
            }

            // BUGFIX: previously "nReads > MAX_READS_TO_LOAD" loaded one read beyond
            // the requested cap; >= enforces the cap exactly.
            if ( MAX_READS_TO_LOAD != -1 && nReads >= MAX_READS_TO_LOAD )
                break;
        }
        // BUGFIX: the reader was never closed before (file-handle leak)
        samReader.close();

        logger.info("Done reading input BAM");
    }

    // True if this read should occupy the "first" slot of its pair binding.
    private static boolean isFirstOfPair(SAMRecord read) {
        return ! read.getReadPairedFlag() || read.getFirstOfPairFlag();
    }

    /**
     * Replaces this read's base qualities with those of the matching read from the
     * quals BAM, if one was loaded; otherwise returns the read unmodified.
     *
     * @param ref  reference bases under the read (unused here)
     * @param read the read to (possibly) update in place
     * @return the same read object, with qualities replaced when a match exists
     */
    public SAMRecord map(char[] ref, SAMRecord read) {
        final String name = read.getReadName();
        if ( readNameToPairs.containsKey(name) ) {
            Pair<SAMRecord, SAMRecord> binding = readNameToPairs.get(name);
            SAMRecord qRead = isFirstOfPair(read) ? binding.first : binding.second;
            if (qRead != null) {
                if ( DEBUG ) {
                    System.out.printf("Replacing read %s quals with %s%n", read.getReadName(), qRead.getReadName());
                    System.out.printf("%s%n", read.getReadName());
                    System.out.printf("%s%n", qRead.getReadName());
                    System.out.printf("%s%n", read.getReadString());
                    System.out.printf("%s%n", qRead.getReadString());
                    System.out.printf("%s%n", read.getBaseQualityString());
                    System.out.printf("%s%n", qRead.getBaseQualityString());
                }
                read.setBaseQualities(qRead.getBaseQualities());
            }
        }

        return read;
    }

    // -----------------------------------------------------------------------------------------------
    // Standard i/o reduce
    //

    /** Closes the output BAM writer, if we opened one. */
    public void onTraversalDone(SAMFileWriter output) {
        if ( output != null ) {
            output.close();
        }
    }

    /**
     * Opens the output BAM writer when outputFilename was supplied; otherwise
     * returns null and reads are printed to standard out in reduce().
     */
    public SAMFileWriter reduceInit() {
        if ( outputFilename != null ) {
            SAMFileWriterFactory fact = new SAMFileWriterFactory();
            SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
            return fact.makeBAMWriter(header, true, new File(outputFilename));
        }
        else {
            return null;
        }
    }

    /**
     * Emits each mapped read to the BAM writer, or to standard out when no
     * output file was requested.
     */
    public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
        if ( output != null ) {
            output.addAlignment(read);
        } else {
            out.println(read.format());
        }

        return output;
    }
}

View File

@ -48,9 +48,9 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
}
public SAMFileWriter reduceInit() {
if ( outputFilename != null ) { // ! outputBamFile.equals("") ) {
if ( outputFilename != null ) {
SAMFileWriterFactory fact = new SAMFileWriterFactory();
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
return fact.makeBAMWriter(header, true, new File(outputFilename));
}
else {
@ -83,8 +83,11 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
* @param duplicateReads
* @return
*/
public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates%n", loc, duplicateReads.size()));
public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
SAMRecord combinedRead = null;
if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) {

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
@ -58,51 +59,63 @@ class QualityTracker {
}
}
public void inc(int b1Q, int b2Q, boolean mismatchP) {
// Records one pairwise base comparison in the quality-pair matrix.
// When orderDependent is false, the two qualities are canonicalized so the larger
// value indexes the first dimension — (q1,q2) and (q2,q1) then accumulate in the
// same cell; when true, the qualities are used in the order supplied.
public void inc(int b1Qi, int b2Qi, boolean mismatchP, boolean orderDependent) {
int b1Q = orderDependent ? b1Qi : Math.max(b1Qi, b2Qi);
int b2Q = orderDependent ? b2Qi : Math.min(b1Qi, b2Qi);
// Fail fast on qualities beyond the table bounds instead of indexing out of range.
if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q);
if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q);
mismatchesByQ[b1Q][b2Q].inc(mismatchP);
}
public void inc(DuplicateComp dc) {
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP());
// Convenience overload: unpacks a DuplicateComp into its two qualities and
// mismatch flag, then delegates to inc(int, int, boolean, boolean).
public void inc(DuplicateComp dc, boolean orderDependent) {
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP(), orderDependent);
}
/**
 * Returns the quality score corresponding to the expected rate at which two
 * independently sequenced bases with quality scores q1 and q2 disagree.
 *
 * @param q1 quality score of the first base
 * @param q2 quality score of the second base
 * @return Phred-scaled score for the combined agreement probability (1 - eMM)
 */
public int probMismatchQ1Q2(int q1, int q2) {
    double e1 = 1 - QualityUtils.qualToProb(q1);
    double e2 = 1 - QualityUtils.qualToProb(q2);
    // Mismatch rate: exactly one base in error, with a correction term for the
    // case where both are in error.
    // BUGFIX: the original wrote "1/3 * e1 * e2" — Java integer division makes
    // 1/3 evaluate to 0, silently dropping the double-error term.  Using 1.0/3.0
    // restores the intended fractional coefficient.
    // NOTE(review): the intended coefficient/sign of the double-error term should
    // be confirmed against the derivation; only the integer-division defect is
    // corrected here.
    double eMM = e1 * (1 - e2) + (1 - e1) * e2 - (1.0 / 3.0) * e1 * e2;
    return QualityUtils.probToQual(1 - eMM, 0.0);
}
public void printToStream(PrintStream out, boolean filterUnobserved) {
out.printf("Q1\tQ2\t%s%n", mismatchesByQ[0][0].headerString());
out.printf("Q1\tQ2\tQmin\t%s%n", mismatchesByQ[0][0].headerString());
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
MismatchCounter mc = mismatchesByQ[i][j];
//System.out.printf("MC = %s%n", mc);
if ( filterUnobserved && mc.nObs == 0 )
continue;
out.printf("%d\t%d\t%s\t%n", i, j, mc.toString());
out.printf("%d\t%d\t%d\t%s\t%n", i, j, probMismatchQ1Q2(i,j), mc.toString());
}
}
}
}
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, QualityTracker> {
@Argument(fullName="filterUnobservedQuals", shortName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data")
@Argument(fullName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data")
public boolean FILTER_UNOBSERVED_QUALS = false;
@Argument(fullName="maxPairwiseCompsPerDupSet", shortName="maxPairwiseCompsPerDupSet", required=false, doc="Maximum number of pairwise comparisons to perform among duplicate read sets")
@Argument(fullName="maxPairwiseCompsPerDupSet", required=false, doc="Maximum number of pairwise comparisons to perform among duplicate read sets")
public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET = 100;
@Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
@Argument(fullName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
public boolean COMBINE_QUALS = false;
@Argument(fullName="combineAllDups", shortName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities")
@Argument(fullName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities")
public boolean COMBINE_ALL_DUPS = false;
@Argument(fullName="orderDependent", required=false, doc="")
public boolean orderDependent = false;
@Argument(fullName="compareToUniqueReads", required=false, doc="If true, then we will compare only to unique (i.e., non-duplicated molecules) at the same duplicate site")
public boolean compareToUniqueReads = false;
@Argument(fullName="comparePairToSingleton", required=false, doc="If true, then we will compare a combined dup to a random other read in the duplicate set, not a combined pair itself")
public boolean comparePairToSingleton = false;
final boolean DEBUG = false;
final private boolean ACTUALLY_DO_WORK = true;
@ -116,15 +129,17 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
public QualityTracker reduce(List<DuplicateComp> dupComps, QualityTracker tracker) {
for ( DuplicateComp dc : dupComps ) {
tracker.inc(dc);
tracker.inc(dc, orderDependent);
}
return tracker;
}
// Print out data for regression
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
//out.printf("%s has %d duplicates%n", loc, duplicateReads.size());
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, LocusContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
if ( ! ACTUALLY_DO_WORK )
@ -135,15 +150,21 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
if ( combinedReads != null ) {
SAMRecord combined1 = combinedReads.first;
SAMRecord combined2 = combinedReads.second;
addPairwiseMatches( pairwiseComps, combined1, combined2 );
if ( comparePairToSingleton )
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
else
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
}
} else {
int nComparisons = 0;
for ( SAMRecord read1 : duplicateReads ) {
for ( SAMRecord read2 : duplicateReads ) {
if ( DupUtils.usableDuplicate(read1, read2) ) {
if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
// the hashcode ensures we don't do A vs. B and B vs. A
//System.out.printf("Comparing %s against %s%n", read1, read2);
nComparisons++;
addPairwiseMatches( pairwiseComps, read1, read2 );
pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
break;
}
@ -155,7 +176,32 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
}
private List<DuplicateComp> addPairwiseMatches(List<DuplicateComp> comps,
SAMRecord read1, SAMRecord read2 ) {
SAMRecord read1, SAMRecord read2,
List<SAMRecord> uniqueReads ) {
if ( compareToUniqueReads ) {
// we want to compare to a read in the unique read set
if ( uniqueReads.size() > 0 ) { // there's actually something to compare to
SAMRecord uniqueRead = uniqueReads.get(0); // might as well get the first one
return pairwiseMatches(comps, read1, uniqueRead);
} else {
return comps;
}
} else {
// default, just do read1 vs. read2
return pairwiseMatches(comps, read1, read2);
}
}
/**
* Calculates the pairwise mismatches between reads read1 and read2 and adds the result to the comps list.
* Doesn't contain any logic deciding what to compare, just does read1 and read2
*
* @param comps
* @param read1
* @param read2
* @return
*/
private List<DuplicateComp> pairwiseMatches(List<DuplicateComp> comps, SAMRecord read1, SAMRecord read2 ) {
byte[] read1Bases = read1.getReadBases();
byte[] read1Quals = read1.getBaseQualities();
byte[] read2Bases = read2.getReadBases();
@ -164,7 +210,7 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
for ( int i = 0; i < read1Bases.length; i++) {
byte qual1 = read1Quals[i];
byte qual2 = read2Quals[i];
boolean mismatchP = read1Bases[i] != read2Bases[i];
boolean mismatchP = ! BaseUtils.basesAreEqual(read1Bases[i], read2Bases[i]);
DuplicateComp dc = new DuplicateComp(qual1, qual2, mismatchP);
comps.add(dc);
}