diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index e4e74e367..f00a681e4 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -176,12 +176,12 @@ public class TraverseDuplicates extends TraversalEngine { // Send each unique read to the map function for ( SAMRecord unique : uniqueReads ) { List l = Arrays.asList(unique); - sum = mapOne(dupWalker, l, site, refBases, locus, sum); + sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum); } } if ( duplicateReads.size() > 0 ) - sum = mapOne(dupWalker, duplicateReads, site, refBases, locus, sum); + sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum); printProgress("dups", site); @@ -231,14 +231,15 @@ public class TraverseDuplicates extends TraversalEngine { } public T mapOne(DuplicateWalker dupWalker, - List readSet, + List uniqueReads, + List duplicateReads, GenomeLoc site, byte[] refBases, LocusContext locus, T sum) { - final boolean keepMeP = dupWalker.filter(site, refBases, locus, readSet); + final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads); if (keepMeP) { - M x = dupWalker.map(site, refBases, locus, readSet); + M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads); sum = dupWalker.reduce(x, sum); } return sum; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java index eb5bf53d8..bc40ee85f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -17,20 +17,23 @@ import net.sf.samtools.SAMRecord; @Requires({DataSource.READS,DataSource.REFERENCE}) public abstract class DuplicateWalker extends Walker { // Do we actually want to operate on the context? - public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context, List duplicateReads) { + public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context, + List uniqueReads, + List duplicateReads) { return true; // We are keeping all the reads } /** * Called by the traversal engine to decide whether to send non-duplicates as lists of * singleton reads to the map function. By default it's false. - * + * * @return true if you want to see non duplicates during the traversal */ public boolean mapUniqueReadsTooP() { return false; } - // Map over the org.broadinstitute.sting.gatk.LocusContext - public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List duplicateReads); + public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, + List uniqueReads, + List duplicateReads); // Given result of map function public abstract ReduceType reduceInit(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReplaceQuals.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReplaceQuals.java new file mode 100755 index 000000000..5a95e2a9a --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReplaceQuals.java @@ -0,0 +1,133 @@ +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Pair; +import net.sf.samtools.*; +import edu.mit.broad.picard.reference.ReferenceSequence; + +import java.util.ArrayList; +import java.util.Random; +import java.util.HashMap; +import java.io.File; + +/** + * ReadErrorRateWalker assesses the error rate per read position ('cycle') by comparing the + * read to its home on the reference and noting the mismatch rate. It ignores reads with + * indels in them, treats high and low-quality references bases the same, and does not count + * ambiguous bases as mismatches. It's also thread-safe, so you can process a slew of reads + * in short order. + * + * @author Kiran Garimella + */ +public class ReplaceQuals extends ReadWalker { + @Argument(shortName="inputQualsBAM",doc="BAM files containing qualities to be replaced",required=true) + public String inputQualsBAM; + + @Argument(shortName="outputBAM", required=false, doc="output BAM file for reads with replaced quals") + public String outputFilename = null; + + public int MAX_READS_TO_LOAD = -1; + + private HashMap> readNameToPairs; + private int READ_PRINT_MOD = 100000; + private final boolean DEBUG = false; + + public void initialize() { + readNameToPairs = new HashMap>(); + + SAMFileReader samReader = new SAMFileReader(new File(inputQualsBAM)); + samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); + + int nReads = 0; + logger.info("Starting to read inputQualsBAM = " + inputQualsBAM); + for ( SAMRecord read : samReader ) { + //System.out.printf("READ is %s%n", read.format()); + + final String name = read.getReadName(); + Pair binding = readNameToPairs.containsKey(name) ? readNameToPairs.get(name) : new Pair(null, null); + if ( read.getFirstOfPairFlag() ) { + binding.first = read; + } else { + binding.second = read; + } + readNameToPairs.put(name, binding); + + if ( ++nReads % READ_PRINT_MOD == 0 ) { + logger.info(String.format(" Read %d reads so far...", nReads)); + } + + if ( nReads > MAX_READS_TO_LOAD && MAX_READS_TO_LOAD != -1 ) + break; + } + + logger.info("Done reading input BAM"); + } + + /** + * + */ + public SAMRecord map(char[] ref, SAMRecord read) { + final String name = read.getReadName(); + + if ( readNameToPairs.containsKey(name) ) { + Pair binding = readNameToPairs.get(name); + SAMRecord qRead = read.getFirstOfPairFlag() ? binding.first : binding.second; + if (qRead != null) { + + if ( DEBUG ) { + System.out.printf("Replacing read %s quals with %s%n", read.getReadName(), qRead.getReadName()); + System.out.printf("%s%n", read.getReadName()); + System.out.printf("%s%n", qRead.getReadName()); + System.out.printf("%s%n", read.getReadString()); + System.out.printf("%s%n", qRead.getReadString()); + System.out.printf("%s%n", read.getBaseQualityString()); + System.out.printf("%s%n", qRead.getBaseQualityString()); + + //if (! read.getReadString().equals(qRead.getReadString())) + // throw new RuntimeException(String.format("BUG: equating %s and %s but bases are different", read.getReadName(), qRead.getReadName())); + } + + read.setBaseQualities(qRead.getBaseQualities()); + } + } + + return read; + } + + // ----------------------------------------------------------------------------------------------- + // Standard i/o reduce + // + public void onTraversalDone(SAMFileWriter output) { + if ( output != null ) { + output.close(); + } + } + + public SAMFileWriter reduceInit() { + if ( outputFilename != null ) { // ! outputBamFile.equals("") ) { + SAMFileWriterFactory fact = new SAMFileWriterFactory(); + SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader(); + return fact.makeBAMWriter(header, true, new File(outputFilename)); + } + else { + return null; + } + } + + /** + * + * + */ + public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) { + if ( output != null ) { + output.addAlignment(read); + } else { + out.println(read.format()); + } + + return output; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java similarity index 90% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java index eb2f1d81a..50d8db818 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java @@ -48,9 +48,9 @@ public class CombineDuplicatesWalker extends DuplicateWalker duplicateReads) { - //logger.info(String.format("%s has %d duplicates%n", loc, duplicateReads.size())); + public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context, + List uniqueReads, + List duplicateReads) { + //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size())); + SAMRecord combinedRead = null; if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicateQualsWalker.java similarity index 53% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicateQualsWalker.java index ed21a02e2..f3c7472c4 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicateQualsWalker.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.duplicates.DuplicateComp; import org.broadinstitute.sting.utils.duplicates.DupUtils; import org.broadinstitute.sting.utils.cmdLine.Argument; @@ -58,51 +59,63 @@ class QualityTracker { } } - public void inc(int b1Q, int b2Q, boolean mismatchP) { + public void inc(int b1Qi, int b2Qi, boolean mismatchP, boolean orderDependent) { + int b1Q = orderDependent ? b1Qi : Math.max(b1Qi, b2Qi); + int b2Q = orderDependent ? b2Qi : Math.min(b1Qi, b2Qi); + if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q); if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q); + mismatchesByQ[b1Q][b2Q].inc(mismatchP); } - public void inc(DuplicateComp dc) { - inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP()); + public void inc(DuplicateComp dc, boolean orderDependent) { + inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP(), orderDependent); } + public int probMismatchQ1Q2(int q1, int q2) { + double e1 = 1 - QualityUtils.qualToProb(q1); + double e2 = 1 - QualityUtils.qualToProb(q2); + double eMM = e1 * (1 - e2) + (1 - e1) * e2 - 1/3 * e1 * e2; + return QualityUtils.probToQual(1 - eMM, 0.0); + } + public void printToStream(PrintStream out, boolean filterUnobserved) { - out.printf("Q1\tQ2\t%s%n", mismatchesByQ[0][0].headerString()); + out.printf("Q1\tQ2\tQmin\t%s%n", mismatchesByQ[0][0].headerString()); for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) { for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) { MismatchCounter mc = mismatchesByQ[i][j]; //System.out.printf("MC = %s%n", mc); if ( filterUnobserved && mc.nObs == 0 ) continue; - out.printf("%d\t%d\t%s\t%n", i, j, mc.toString()); + out.printf("%d\t%d\t%d\t%s\t%n", i, j, probMismatchQ1Q2(i,j), mc.toString()); } } } } - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 2:52:28 PM - * To change this template use File | Settings | File Templates. - */ public class DuplicateQualsWalker extends DuplicateWalker, QualityTracker> { - @Argument(fullName="filterUnobservedQuals", shortName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data") + @Argument(fullName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data") public boolean FILTER_UNOBSERVED_QUALS = false; - @Argument(fullName="maxPairwiseCompsPerDupSet", shortName="maxPairwiseCompsPerDupSet", required=false, doc="Maximumize number of pairwise comparisons to perform among duplicate read sets") + @Argument(fullName="maxPairwiseCompsPerDupSet", required=false, doc="Maximumize number of pairwise comparisons to perform among duplicate read sets") public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET = 100; - @Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities") + @Argument(fullName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities") public boolean COMBINE_QUALS = false; - @Argument(fullName="combineAllDups", shortName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities") + @Argument(fullName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities") public boolean COMBINE_ALL_DUPS = false; + @Argument(fullName="orderDependent", required=false, doc="") + public boolean orderDependent = false; + + @Argument(fullName="compareToUniqueReads", required=false, doc="If true, then we will compare only to unique (i.e., non-duplicated molecules) at the same duplicate site") + public boolean compareToUniqueReads = false; + + @Argument(fullName="comparePairToSingleton", required=false, doc="If true, then we will compare a combined dup to a random other read in the duplicate set, not a combined pair itself") + public boolean comparePairToSingleton = false; + final boolean DEBUG = false; final private boolean ACTUALLY_DO_WORK = true; @@ -116,15 +129,17 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q public QualityTracker reduce(List dupComps, QualityTracker tracker) { for ( DuplicateComp dc : dupComps ) { - tracker.inc(dc); + tracker.inc(dc, orderDependent); } return tracker; } // Print out data for regression - public List map(GenomeLoc loc, byte[] refBases, LocusContext context, List duplicateReads) { - //out.printf("%s has %d duplicates%n", loc, duplicateReads.size()); + public List map(GenomeLoc loc, byte[] refBases, LocusContext context, + List uniqueReads, + List duplicateReads) { + //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size())); List pairwiseComps = new ArrayList(); if ( ! ACTUALLY_DO_WORK ) @@ -135,15 +150,21 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q if ( combinedReads != null ) { SAMRecord combined1 = combinedReads.first; SAMRecord combined2 = combinedReads.second; - addPairwiseMatches( pairwiseComps, combined1, combined2 ); + + if ( comparePairToSingleton ) + pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads ); + else + pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads ); } } else { int nComparisons = 0; for ( SAMRecord read1 : duplicateReads ) { for ( SAMRecord read2 : duplicateReads ) { - if ( DupUtils.usableDuplicate(read1, read2) ) { + if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) { + // the hashcode insures we don't do A vs. B and B vs. A + //System.out.printf("Comparing %s against %s%n", read1, read2); nComparisons++; - addPairwiseMatches( pairwiseComps, read1, read2 ); + pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads ); if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) break; } @@ -155,7 +176,32 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q } private List addPairwiseMatches(List comps, - SAMRecord read1, SAMRecord read2 ) { + SAMRecord read1, SAMRecord read2, + List uniqueReads ) { + if ( compareToUniqueReads ) { + // we want to compare to a read in the unique read set + if ( uniqueReads.size() > 0 ) { // there's actually something to compare to + SAMRecord uniqueRead = uniqueReads.get(0); // might as well get the first one + return pairwiseMatches(comps, read1, uniqueRead); + } else { + return comps; + } + } else { + // default, just do read1 vs. read2 + return pairwiseMatches(comps, read1, read2); + } + } + + /** + * Calculates the pairwise mismatches between reads read1 and read2 and adds the result to the comps list. + * Doesn't contain any logic deciding what to compare, just does read1 and read2 + * + * @param comps + * @param read1 + * @param read2 + * @return + */ + private List pairwiseMatches(List comps, SAMRecord read1, SAMRecord read2 ) { byte[] read1Bases = read1.getReadBases(); byte[] read1Quals = read1.getBaseQualities(); byte[] read2Bases = read2.getReadBases(); @@ -164,7 +210,7 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q for ( int i = 0; i < read1Bases.length; i++) { byte qual1 = read1Quals[i]; byte qual2 = read2Quals[i]; - boolean mismatchP = read1Bases[i] != read2Bases[i]; + boolean mismatchP = ! BaseUtils.basesAreEqual(read1Bases[i], read2Bases[i]); DuplicateComp dc = new DuplicateComp(qual1, qual2, mismatchP); comps.add(dc); }