System for traversing duplicate reads, along with a walker to compute quality scores among duplicates and a smarter method to combine quality scores across duplicates -- v1

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@624 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-05-07 18:06:02 +00:00
parent 71e8f47a6c
commit 2204be43eb
4 changed files with 165 additions and 130 deletions

View File

@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
@ -89,8 +90,9 @@ public class TraverseDuplicates extends TraversalEngine {
return l;
}
private List<SAMRecord> collectDuplicates(List<SAMRecord> reads) {
ArrayList<SAMRecord> dups = new ArrayList<SAMRecord>();
private Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
List<SAMRecord> dups = new ArrayList<SAMRecord>();
// find the first duplicate
SAMRecord key = null;
@ -104,7 +106,7 @@ public class TraverseDuplicates extends TraversalEngine {
}
// At this point, there are two possibilities, we have found at least one dup or not
//System.out.printf("Key is %s%n", key);
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
if ( key != null ) {
final GenomeLoc keyLoc = new GenomeLoc(key);
final GenomeLoc keyMateLoc = new GenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
@ -117,11 +119,15 @@ public class TraverseDuplicates extends TraversalEngine {
// we are at the same position as the dup and have the same mate pos, so it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
dups.add(read);
} else {
uniques.add(read);
}
}
} else {
uniques = reads;
}
return dups;
return new Pair<List<SAMRecord>, List<SAMRecord>>(uniques, dups);
}
/**
@ -146,9 +152,13 @@ public class TraverseDuplicates extends TraversalEngine {
for (SAMRecord read: iter) {
// get the genome loc from the read
GenomeLoc site = new GenomeLoc(read);
logger.debug(String.format("*** TraverseDuplicates.traverse at %s", site));
List<SAMRecord> reads = readsAtLoc(read, iter);
List<SAMRecord> duplicateReads = collectDuplicates(reads);
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
List<SAMRecord> uniqueReads = split.getFirst();
List<SAMRecord> duplicateReads = split.getSecond();
logger.debug(String.format("*** TraverseDuplicates.traverse at %s has %d unique and %d duplicate reads",
site, uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location
LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0));
@ -161,12 +171,17 @@ public class TraverseDuplicates extends TraversalEngine {
byte[] refBases = new byte[0];
final boolean keepMeP = dupWalker.filter(site, refBases, locus, duplicateReads);
if (keepMeP) {
M x = dupWalker.map(site, refBases, locus, duplicateReads);
sum = dupWalker.reduce(x, sum);
if ( dupWalker.mapUniqueReadsTooP() ) {
// Send each unique read to the map function
for ( SAMRecord unique : uniqueReads ) {
List<SAMRecord> l = Arrays.asList(unique);
sum = mapOne(dupWalker, l, site, refBases, locus, sum);
}
}
if ( duplicateReads.size() > 0 )
sum = mapOne(dupWalker, duplicateReads, site, refBases, locus, sum);
printProgress("dups", site);
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
@ -213,7 +228,20 @@ public class TraverseDuplicates extends TraversalEngine {
return result;
}
}
/**
 * Run one filter/map/reduce cycle of the duplicate walker over a single read set.
 *
 * @param dupWalker the walker supplying filter/map/reduce
 * @param readSet   the reads (unique singleton or duplicate set) to process
 * @param site      the genomic location of the read set
 * @param refBases  reference bases at the site (may be empty)
 * @param locus     the locus context built for this site
 * @param sum       the running reduce accumulator
 * @return the accumulator, updated only if the walker's filter accepted the read set
 */
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
                       List<SAMRecord> readSet,
                       GenomeLoc site,
                       byte[] refBases,
                       LocusContext locus,
                       T sum) {
    // guard clause: a rejected read set leaves the accumulator untouched
    if ( ! dupWalker.filter(site, refBases, locus, readSet) )
        return sum;
    final M mapped = dupWalker.map(site, refBases, locus, readSet);
    return dupWalker.reduce(mapped, sum);
}
// --------------------------------------------------------------------------------------------------------------
//

View File

@ -29,6 +29,15 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
public boolean requiresReads() { return true; }
public boolean cannotHandleReads() { return false; }
/**
* Called by the traversal engine to decide whether to send non-duplicates as lists of
* singleton reads to the map function. By default it's false.
*
* @return true if you want to see non duplicates during the traversal
*/
public boolean mapUniqueReadsTooP() { return false; }
// Map over the org.broadinstitute.sting.gatk.LocusContext
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads);

View File

@ -0,0 +1,108 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List;
import java.util.ArrayList;
import java.io.PrintStream;
import java.io.File;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMFileHeader;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Walker that collapses each set of duplicate reads into a single consensus read
 * (via DupUtils.combineDuplicates) and writes the result either to a BAM file or,
 * when no output file is given, to standard out.
 */
public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileWriter> {
    @Argument(fullName="outputBAM", shortName="outputBAM", required=false, defaultValue="", doc="BAM File to write combined duplicates to")
    public String outputFilename;

    @Argument(fullName="includeUniqueReads", shortName="includeUniqueReads", required=false, defaultValue="true", doc="If true, also writes out non-duplicate reads in file")
    public boolean INCLUDE_UNIQUE_READS;

    @Argument(fullName="maxQ", shortName="maxQ", required=false, defaultValue="50",
            doc="The maximum Q score allowed for combined reads, reflects the background error rate giving rise to perfect bases that don't correspond to the reference")
    public int MAX_QUALITY_SCORE;

    final boolean DEBUG = false;

    /**
     * Tell the traversal engine whether to also send non-duplicate reads (as
     * singleton lists) to map(); driven by the -includeUniqueReads argument.
     */
    public boolean mapUniqueReadsTooP() {
        return INCLUDE_UNIQUE_READS;
    }

    // -----------------------------------------------------------------------------------------------
    // Standard i/o reduce
    //

    /** Close the BAM writer, if one was opened, when the traversal finishes. */
    public void onTraversalDone(SAMFileWriter output) {
        if ( output != null ) {
            output.close();
        }
    }

    /**
     * Open the output BAM writer, or return null to fall back to printing reads
     * to standard out.
     */
    public SAMFileWriter reduceInit() {
        // BUGFIX: the @Argument defaultValue is "", so a null check alone never
        // reached the stdout fallback and new File("") was handed to the writer;
        // require a non-empty filename before creating a BAM writer
        if ( outputFilename != null && !outputFilename.equals("") ) {
            SAMFileWriterFactory fact = new SAMFileWriterFactory();
            SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
            return fact.makeBAMWriter(header, true, new File(outputFilename));
        }
        else {
            return null;
        }
    }

    /**
     * Emit one combined read: to the BAM writer when available, otherwise to
     * standard out in SAM text form.
     */
    public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
        if ( output != null ) {
            output.addAlignment(read);
        } else {
            out.println(read.format());
        }
        return output;
    }

    /**
     * Build a combined read given the input list of non-unique reads. If there's just one read in the
     * set, it's considered unique and returned. If there's more than one, the N-way combine
     * duplicate function is invoked.
     *
     * @param loc the genomic location shared by the duplicate set
     * @param refBases reference bases at the site (unused here)
     * @param context the locus context for the site (unused here)
     * @param duplicateReads the reads to collapse; must be non-empty
     * @return the single input read, or the consensus read built from the set
     */
    public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
        if ( duplicateReads.size() == 1 ) {
            // we are a unique read; nothing to combine
            return duplicateReads.get(0);
        } else {
            // actually call the N-way combine function, capping qualities at MAX_QUALITY_SCORE
            return DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE);
        }
    }
}

View File

@ -1,20 +1,17 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List;
import java.util.ArrayList;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import net.sf.samtools.SAMRecord;
@ -68,7 +65,7 @@ class QualityTracker {
}
public void inc(DuplicateComp dc) {
inc(dc.qLarger, dc.qSmaller, dc.mismatchP);
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP());
}
public void printToStream(PrintStream out, boolean filterUnobserved) {
@ -85,21 +82,6 @@ class QualityTracker {
}
}
/**
 * A single pairwise base comparison between two duplicate reads: the two base
 * qualities ordered largest-first, plus whether the bases themselves disagreed.
 */
class DuplicateComp {
    int qLarger;        // larger of the two base qualities
    int qSmaller;       // smaller of the two base qualities
    boolean mismatchP;  // true if the two bases disagreed

    public DuplicateComp( int b1Q, int b2Q, boolean mismatchP ) {
        if ( b1Q >= b2Q ) {
            qLarger = b1Q;
            qSmaller = b2Q;
        } else {
            qLarger = b2Q;
            qSmaller = b1Q;
        }
        this.mismatchP = mismatchP;
    }

    public String toString() {
        return qLarger + " " + qSmaller + " " + mismatchP;
    }
}
/**
* Created by IntelliJ IDEA.
@ -118,6 +100,9 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
@Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
public boolean COMBINE_QUALS = false;
@Argument(fullName="combineAllDups", shortName="combineAllDups", required=false, defaultValue="false", doc="Combine and assess pairwise base qualities")
public boolean COMBINE_ALL_DUPS;
final boolean DEBUG = false;
final private boolean ACTUALLY_DO_WORK = true;
@ -146,18 +131,17 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
return pairwiseComps;
if ( COMBINE_QUALS ) {
Pair<SAMRecord, SAMRecord> combinedReads = combinedReadPair( duplicateReads );
Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
if ( combinedReads != null ) {
SAMRecord combined1 = combinedReads.first;
SAMRecord combined2 = combinedReads.second;
addPairwiseMatches( pairwiseComps, combined1, combined2 );
}
}
else {
} else {
int nComparisons = 0;
for ( SAMRecord read1 : duplicateReads ) {
for ( SAMRecord read2 : duplicateReads ) {
if ( usableDuplicate(read1, read2) ) {
if ( DupUtils.usableDuplicate(read1, read2) ) {
nComparisons++;
addPairwiseMatches( pairwiseComps, read1, read2 );
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
@ -169,10 +153,6 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
return pairwiseComps;
}
/**
 * Can these two reads be compared as duplicates? They must be distinct objects
 * and have identical read lengths (so bases line up index-for-index).
 */
private boolean usableDuplicate( SAMRecord read1, SAMRecord read2 ) {
    if ( read1 == read2 )
        return false;
    return read1.getReadLength() == read2.getReadLength();
}
private List<DuplicateComp> addPairwiseMatches(List<DuplicateComp> comps,
SAMRecord read1, SAMRecord read2 ) {
@ -191,94 +171,4 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
return comps;
}
/**
 * Build two independent combined reads from the first four duplicates, pairing
 * reads (0,1) and (2,3).
 *
 * @param duplicateReads the duplicate set to draw from
 * @return the two combined reads, or null if fewer than four reads are available
 */
private Pair<SAMRecord, SAMRecord> combinedReadPair( List<SAMRecord> duplicateReads ) {
    // two disjoint pairs require at least four reads
    if ( duplicateReads.size() < 4 )
        return null;
    final SAMRecord first = combineDuplicates(duplicateReads.get(0), duplicateReads.get(1));
    final SAMRecord second = combineDuplicates(duplicateReads.get(2), duplicateReads.get(3));
    return new Pair<SAMRecord, SAMRecord>(first, second);
}
/**
 * Find a third read in the duplicate set that is usable against both read1 and
 * read2 (distinct from each and matching both in length).
 *
 * @return the first such read, or null if the set has no usable third read
 */
private SAMRecord sample3rdRead( List<SAMRecord> duplicateReads, SAMRecord read1, SAMRecord read2 ) {
    if ( duplicateReads.size() > 2 ) {
        for ( SAMRecord candidate : duplicateReads ) {
            if ( usableDuplicate(read1, candidate) && usableDuplicate(read2, candidate) )
                return candidate;
        }
    }
    // fewer than three reads, or no candidate was usable against both
    return null;
}
/**
 * Make a working copy of a read, carrying over name, bases, qualities,
 * alignment coordinates, mate info, insert size, mapping quality, CIGAR,
 * and flags. Used as the scaffold for a combined consensus read.
 *
 * @param read the read to copy
 * @return a new SAMRecord sharing the input's header and alignment metadata
 */
private SAMRecord tmpCopyRead(SAMRecord read) {
    SAMRecord copy = new SAMRecord(read.getHeader());
    copy.setReadName(read.getReadName());
    copy.setReadBases(read.getReadBases());
    copy.setBaseQualities(read.getBaseQualities());
    copy.setReferenceName(read.getReferenceName());
    copy.setReferenceIndex(read.getReferenceIndex());
    copy.setMateReferenceName(read.getMateReferenceName());
    copy.setMateReferenceIndex(read.getMateReferenceIndex());
    copy.setAlignmentStart(read.getAlignmentStart());
    // alignment end is derived from start + CIGAR, so it is not set explicitly
    copy.setMateAlignmentStart(read.getMateAlignmentStart());
    copy.setInferredInsertSize(read.getInferredInsertSize());
    copy.setMappingQuality(read.getMappingQuality());
    copy.setCigar(read.getCigar());
    // BUGFIX: was copy.setFlags(copy.getFlags()), which re-assigned the fresh
    // copy's default flags to itself and silently dropped the source read's flags
    copy.setFlags(read.getFlags());
    return copy;
}
/**
 * Combine two duplicate reads base-by-base into a single consensus read.
 * Where the bases agree, the combined quality reflects the joint probability
 * that both calls are correct; where they disagree, the higher-confidence base
 * wins at its own confidence.
 *
 * NOTE(review): indexes read2's arrays by read1's length — assumes equal read
 * lengths, as enforced by usableDuplicate(); confirm all callers guarantee this.
 *
 * @param read1 first duplicate; also supplies the metadata for the result
 * @param read2 second duplicate
 * @return a new consensus read built on a copy of read1
 */
private SAMRecord combineDuplicates(SAMRecord read1, SAMRecord read2) {
    final byte[] bases1 = read1.getReadBases();
    final byte[] quals1 = read1.getBaseQualities();
    final byte[] bases2 = read2.getReadBases();
    final byte[] quals2 = read2.getBaseQualities();

    final int len = bases1.length;
    final byte[] combinedBases = new byte[len];
    final byte[] combinedQuals = new byte[len];

    SAMRecord combined = tmpCopyRead(read1);

    for ( int i = 0; i < len; i++ ) {
        final byte b1 = bases1[i];
        final byte b2 = bases2[i];
        final byte q1 = quals1[i];
        final byte q2 = quals2[i];
        final double p1 = QualityUtils.qualToProb(q1);
        final double p2 = QualityUtils.qualToProb(q2);

        final byte consensusBase;
        final double consensusProb;
        if ( b1 == b2 ) {
            // agreement: probability that at least one of the two calls is right
            consensusBase = b1;
            consensusProb = 1 - (1 - p1) * (1 - p2);
        } else {
            // disagreement: keep the higher-confidence base at its own confidence
            consensusBase = p1 > p2 ? b1 : b2;
            consensusProb = p1 > p2 ? p1 : p2;
        }

        combinedBases[i] = consensusBase;
        combinedQuals[i] = QualityUtils.probToQual(consensusProb, 0.0);

        if ( DEBUG )
            logger.debug(String.format("Combining %s (Q%2d) with %s (Q%2d) -> %s (Q%2d)%s%n",
                    (char)b1, q1, (char)b2, q2, (char)combinedBases[i], combinedQuals[i],
                    b1 == b2 ? "" : " [MISMATCH]"));
    }

    combined.setReadBases(combinedBases);
    combined.setBaseQualities(combinedQuals);
    return combined;
}
}