From 2204be43ebf5d3e1895cd039eadd34bfc10de6da Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 7 May 2009 18:06:02 +0000 Subject: [PATCH] System for traversing duplicate reads, along with a walker to compute quality scores among duplicates and a smarter method to combine quality scores across duplicates -- v1 git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@624 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/traversals/TraverseDuplicates.java | 50 +++++-- .../sting/gatk/walkers/DuplicateWalker.java | 9 ++ .../gatk/walkers/CombineDuplicatesWalker.java | 108 +++++++++++++++ .../gatk/walkers/DuplicateQualsWalker.java | 128 ++---------------- 4 files changed, 165 insertions(+), 130 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index b8f783c93..2cf46508e 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; import java.io.File; @@ -89,8 +90,9 @@ public class TraverseDuplicates extends TraversalEngine { return l; } - private List collectDuplicates(List reads) { - ArrayList dups = new ArrayList(); + private Pair, List> splitDuplicates(List reads) { + List uniques = new ArrayList(); + List dups = new ArrayList(); // find the first duplicate SAMRecord key = null; @@ -104,7 +106,7 @@ public class TraverseDuplicates extends TraversalEngine { } // At this point, there are two possibilities, we have found at least one dup or not - //System.out.printf("Key is %s%n", key); + // if it's a dup, add it to the dups list, otherwise add it to the uniques list if ( key != null ) { final GenomeLoc keyLoc = new GenomeLoc(key); final GenomeLoc keyMateLoc = new GenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); @@ -117,11 +119,15 @@ public class TraverseDuplicates extends TraversalEngine { // we are at the same position as the dup and have the same mat pos, it's a dup if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read)); dups.add(read); + } else { + uniques.add(read); } } + } else { + uniques = reads; } - return dups; + return new Pair, List>(uniques, dups); } /** @@ -146,9 +152,13 @@ public class TraverseDuplicates extends TraversalEngine { for (SAMRecord read: iter) { // get the genome loc from the read GenomeLoc site = new GenomeLoc(read); - logger.debug(String.format("*** TraverseDuplicates.traverse at %s", site)); List reads = readsAtLoc(read, iter); - List duplicateReads = collectDuplicates(reads); + Pair, List> split = splitDuplicates(reads); + List uniqueReads = split.getFirst(); + List duplicateReads = split.getSecond(); + + logger.debug(String.format("*** TraverseDuplicates.traverse at %s has %d unique and %d duplicate reads", + site, uniqueReads.size(), duplicateReads.size())); // Jump forward in the reference to this locus location LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0)); @@ -161,12 +171,17 @@ public class TraverseDuplicates extends TraversalEngine { byte[] refBases = new byte[0]; - final boolean keepMeP = dupWalker.filter(site, refBases, locus, duplicateReads); - if (keepMeP) { - M x = dupWalker.map(site, refBases, locus, duplicateReads); - sum = dupWalker.reduce(x, sum); + if ( dupWalker.mapUniqueReadsTooP() ) { + // Send each unique read to the map function + for ( SAMRecord unique : uniqueReads ) { + List l = Arrays.asList(unique); + sum = mapOne(dupWalker, l, site, refBases, locus, sum); + } } + if ( duplicateReads.size() > 0 ) + sum = mapOne(dupWalker, duplicateReads, site, refBases, locus, sum); + printProgress("dups", site); if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { @@ -213,7 +228,20 @@ public class TraverseDuplicates extends TraversalEngine { return result; } } - + + public T mapOne(DuplicateWalker dupWalker, + List readSet, + GenomeLoc site, + byte[] refBases, + LocusContext locus, + T sum) { + final boolean keepMeP = dupWalker.filter(site, refBases, locus, readSet); + if (keepMeP) { + M x = dupWalker.map(site, refBases, locus, readSet); + sum = dupWalker.reduce(x, sum); + } + return sum; + } // -------------------------------------------------------------------------------------------------------------- // diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java index 1fc494e6e..00e03d677 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -29,6 +29,15 @@ public abstract class DuplicateWalker extends Walker duplicateReads); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java new file mode 100644 index 000000000..e79f0867d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java @@ -0,0 +1,108 @@ +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.duplicates.DuplicateComp; +import org.broadinstitute.sting.utils.duplicates.DupUtils; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.List; +import java.util.ArrayList; +import java.io.PrintStream; +import java.io.File; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMFileWriterFactory; +import net.sf.samtools.SAMFileHeader; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +public class CombineDuplicatesWalker extends DuplicateWalker { + @Argument(fullName="outputBAM", shortName="outputBAM", required=false, defaultValue="", doc="BAM File to write combined duplicates to") + public String outputFilename; + + @Argument(fullName="includeUniqueReads", shortName="includeUniqueReads", required=false, defaultValue="true", doc="If true, also writes out non-duplicate reads in file") + public boolean INCLUDE_UNIQUE_READS; + + @Argument(fullName="maxQ", shortName="maxQ", required=false, defaultValue="50", + doc="The maximum Q score allowed for combined reads, reflects the background error rate giving rise to perfect bases that don't correspond to the reference") + public int MAX_QUALITY_SCORE; + + final boolean DEBUG = false; + + public boolean mapUniqueReadsTooP() { + return INCLUDE_UNIQUE_READS; + } + + // ----------------------------------------------------------------------------------------------- + // Standard i/o reduce + // + public void onTraversalDone(SAMFileWriter output) { + if ( output != null ) { + output.close(); + } + } + + public SAMFileWriter reduceInit() { + if ( outputFilename != null ) { // ! outputFile.equals("") ) { + SAMFileWriterFactory fact = new SAMFileWriterFactory(); + SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader(); + return fact.makeBAMWriter(header, true, new File(outputFilename)); + } + else { + return null; + } + } + + /** + * + * + */ + public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) { + if ( output != null ) { + output.addAlignment(read); + } else { + out.println(read.format()); + } + + return output; + } + + /** + * Build a combined read given the input list of non-unique reads. If there's just one read in the + * set, it's considered unique and returned. If there's more than one, the N-way combine + * duplicate function is invoked. + * + * @param loc + * @param refBases + * @param context + * @param duplicateReads + * @return + */ + public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context, List duplicateReads) { + //logger.info(String.format("%s has %d duplicates%n", loc, duplicateReads.size())); + SAMRecord combinedRead = null; + + if ( duplicateReads.size() == 1 ) { + // we are a unique read + combinedRead = duplicateReads.get(0); + } else { + // actually call the combine function + //for (SAMRecord read : duplicateReads ) { + // System.out.printf("Read %s%n", read.format()); + //} + combinedRead = DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE); + } + + return combinedRead; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java index e7b5672b7..8a123a290 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java @@ -1,20 +1,17 @@ package org.broadinstitute.sting.playground.gatk.walkers; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.duplicates.DuplicateComp; +import org.broadinstitute.sting.utils.duplicates.DupUtils; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.List; import java.util.ArrayList; import java.io.PrintStream; -import java.io.FileNotFoundException; import net.sf.samtools.SAMRecord; @@ -68,7 +65,7 @@ class QualityTracker { } public void inc(DuplicateComp dc) { - inc(dc.qLarger, dc.qSmaller, dc.mismatchP); + inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP()); } public void printToStream(PrintStream out, boolean filterUnobserved) { @@ -85,21 +82,6 @@ class QualityTracker { } } -class DuplicateComp { - int qLarger; - int qSmaller; - boolean mismatchP; - - public DuplicateComp( int b1Q, int b2Q, boolean mismatchP ) { - qLarger = Math.max(b1Q, b2Q); - qSmaller = Math.min(b1Q, b2Q); - this.mismatchP = mismatchP; - } - - public String toString() { - return String.format("%d %d %b", qLarger, qSmaller, mismatchP); - } -} /** * Created by IntelliJ IDEA. @@ -118,6 +100,9 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q @Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities") public boolean COMBINE_QUALS = false; + @Argument(fullName="combineAllDups", shortName="combineAllDups", required=false, defaultValue="false", doc="Combine and assess pairwise base qualities") + public boolean COMBINE_ALL_DUPS; + final boolean DEBUG = false; final private boolean ACTUALLY_DO_WORK = true; @@ -146,18 +131,17 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q return pairwiseComps; if ( COMBINE_QUALS ) { - Pair combinedReads = combinedReadPair( duplicateReads ); + Pair combinedReads = DupUtils.combinedReadPair( duplicateReads ); if ( combinedReads != null ) { SAMRecord combined1 = combinedReads.first; SAMRecord combined2 = combinedReads.second; addPairwiseMatches( pairwiseComps, combined1, combined2 ); } - } - else { + } else { int nComparisons = 0; for ( SAMRecord read1 : duplicateReads ) { for ( SAMRecord read2 : duplicateReads ) { - if ( usableDuplicate(read1, read2) ) { + if ( DupUtils.usableDuplicate(read1, read2) ) { nComparisons++; addPairwiseMatches( pairwiseComps, read1, read2 ); if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) @@ -169,10 +153,6 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q return pairwiseComps; } - - private boolean usableDuplicate( SAMRecord read1, SAMRecord read2 ) { - return read1 != read2 && read1.getReadLength() == read2.getReadLength(); - } private List addPairwiseMatches(List comps, SAMRecord read1, SAMRecord read2 ) { @@ -191,94 +171,4 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q return comps; } - - private Pair combinedReadPair( List duplicateReads ) { - if ( duplicateReads.size() < 4 ) - return null; - - SAMRecord c1 = combineDuplicates(duplicateReads.get(0),duplicateReads.get(1)); - SAMRecord c2 = combineDuplicates(duplicateReads.get(2),duplicateReads.get(3)); - return new Pair(c1, c2); - } - - private SAMRecord sample3rdRead( List duplicateReads, SAMRecord read1, SAMRecord read2 ) { - if ( duplicateReads.size() <= 2 ) { - // no third unique read is available - return null; - } else { - for ( SAMRecord read3 : duplicateReads ) { - if ( usableDuplicate(read1, read3) && usableDuplicate(read2, read3) ) - return read3; - } - - return null; - } - } - - private SAMRecord tmpCopyRead(SAMRecord read) { - SAMRecord copy = new SAMRecord(read.getHeader()); - copy.setReadName(read.getReadName()); - //copy.setReadString(final String value) { - copy.setReadBases(read.getReadBases()); - copy.setBaseQualities(read.getBaseQualities()); - copy.setReferenceName(read.getReferenceName()); - copy.setReferenceIndex(read.getReferenceIndex()); - copy.setMateReferenceName(read.getMateReferenceName()); - copy.setMateReferenceIndex(read.getMateReferenceIndex()); - copy.setAlignmentStart(read.getAlignmentStart()); - //copy.setAlignmentEnd(read.getAlignmentEnd()); - copy.setMateAlignmentStart(read.getMateAlignmentStart()); - copy.setInferredInsertSize(read.getInferredInsertSize()); - copy.setMappingQuality(read.getMappingQuality()); - copy.setCigar(read.getCigar()); - copy.setFlags(copy.getFlags()); - - return copy; - } - - private SAMRecord combineDuplicates(SAMRecord read1, SAMRecord read2) { - byte[] read1Bases = read1.getReadBases(); - byte[] read1Quals = read1.getBaseQualities(); - byte[] read2Bases = read2.getReadBases(); - byte[] read2Quals = read2.getBaseQualities(); - - byte[] bases = new byte[read1Bases.length]; - byte[] quals = new byte[read1Bases.length]; - - SAMRecord c = tmpCopyRead(read1); - for ( int i = 0; i < read1Bases.length; i++) { - byte base1 = read1Bases[i]; - byte base2 = read2Bases[i]; - byte qual1 = read1Quals[i]; - byte qual2 = read2Quals[i]; - final double p1 = QualityUtils.qualToProb(qual1); - final double p2 = QualityUtils.qualToProb(qual2); - - double pc; - byte basec; - - if ( base1 == base2 ) { - // agreement - basec = base1; - pc = 1 - (1 - p1) * (1 - p2); - } else { - // disagreement - basec = p1 > p2 ? base1 : base2; - pc = p1 > p2 ? p1 : p2; - //pc = 0; - } - - bases[i] = basec; - quals[i] = QualityUtils.probToQual(pc, 0.0); - - if ( DEBUG ) - logger.debug(String.format("Combining %s (Q%2d) with %s (Q%2d) -> %s (Q%2d)%s%n", - (char)base1, qual1, (char)base2, qual2, (char)bases[i], quals[i], - base1 == base2 ? "" : " [MISMATCH]")); - } - c.setReadBases(bases); - c.setBaseQualities(quals); - - return c; - } } \ No newline at end of file