More improvements to the duplicate quality combiner, making progress towards a clean system

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@788 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-05-21 22:26:57 +00:00
parent 04e51c8d1d
commit 30c63daf89
5 changed files with 224 additions and 38 deletions

View File

@ -176,12 +176,12 @@ public class TraverseDuplicates extends TraversalEngine {
// Send each unique read to the map function
for ( SAMRecord unique : uniqueReads ) {
List<SAMRecord> l = Arrays.asList(unique);
sum = mapOne(dupWalker, l, site, refBases, locus, sum);
sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
}
}
if ( duplicateReads.size() > 0 )
sum = mapOne(dupWalker, duplicateReads, site, refBases, locus, sum);
sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
printProgress("dups", site);
@ -231,14 +231,15 @@ public class TraverseDuplicates extends TraversalEngine {
}
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
List<SAMRecord> readSet,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads,
GenomeLoc site,
byte[] refBases,
LocusContext locus,
T sum) {
final boolean keepMeP = dupWalker.filter(site, refBases, locus, readSet);
final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads);
if (keepMeP) {
M x = dupWalker.map(site, refBases, locus, readSet);
M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads);
sum = dupWalker.reduce(x, sum);
}
return sum;

View File

@ -17,20 +17,23 @@ import net.sf.samtools.SAMRecord;
@Requires({DataSource.READS,DataSource.REFERENCE})
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
return true; // We are keeping all the reads
}
/**
* Called by the traversal engine to decide whether to send non-duplicates as lists of
* singleton reads to the map function. By default it's false.
*
*
* @return true if you want to see non duplicates during the traversal
*/
public boolean mapUniqueReadsTooP() { return false; }
// Map over the org.broadinstitute.sting.gatk.LocusContext
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads);
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads);
// Given result of map function
public abstract ReduceType reduceInit();

View File

@ -0,0 +1,133 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import net.sf.samtools.*;
import edu.mit.broad.picard.reference.ReferenceSequence;
import java.util.ArrayList;
import java.util.Random;
import java.util.HashMap;
import java.io.File;
/**
 * ReplaceQuals replaces the base qualities of each read in the traversal with the
 * qualities of the same-named read (matched on read name and first/second-of-pair
 * status) loaded from a second BAM file given by inputQualsBAM.  Reads with no
 * match in the quals BAM pass through unchanged.  Output goes to outputBAM when
 * provided, otherwise reads are printed to standard out.
 *
 * NOTE(review): the previous class comment described ReadErrorRateWalker (a
 * copy-paste leftover); it has been rewritten to match this walker's behavior.
 *
 * @author Kiran Garimella
 */
public class ReplaceQuals extends ReadWalker<SAMRecord, SAMFileWriter> {
    @Argument(shortName="inputQualsBAM",doc="BAM files containing qualities to be replaced",required=true)
    public String inputQualsBAM;

    @Argument(shortName="outputBAM", required=false, doc="output BAM file for reads with replaced quals")
    public String outputFilename = null;

    // Maximum number of reads to load from inputQualsBAM; -1 means unlimited.
    public int MAX_READS_TO_LOAD = -1;

    // read name -> (first-of-pair read, second-of-pair read) from the quals BAM
    private HashMap<String, Pair<SAMRecord, SAMRecord>> readNameToPairs;

    // progress-logging interval while loading the quals BAM
    private int READ_PRINT_MOD = 100000;
    private final boolean DEBUG = false;

    /**
     * Loads up to MAX_READS_TO_LOAD reads from inputQualsBAM into readNameToPairs,
     * keyed by read name, with first and second reads of a pair kept separately.
     */
    public void initialize() {
        readNameToPairs = new HashMap<String, Pair<SAMRecord, SAMRecord>>();

        SAMFileReader samReader = new SAMFileReader(new File(inputQualsBAM));
        samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);

        int nReads = 0;
        logger.info("Starting to read inputQualsBAM = " + inputQualsBAM);
        for ( SAMRecord read : samReader ) {
            final String name = read.getReadName();
            Pair<SAMRecord, SAMRecord> binding = readNameToPairs.containsKey(name) ? readNameToPairs.get(name) : new Pair<SAMRecord, SAMRecord>(null, null);

            // Treat unpaired reads as first-of-pair; calling getFirstOfPairFlag()
            // on an unpaired read would throw an IllegalStateException.
            if ( isFirstOfPair(read) ) {
                binding.first = read;
            } else {
                binding.second = read;
            }
            readNameToPairs.put(name, binding);

            if ( ++nReads % READ_PRINT_MOD == 0 ) {
                logger.info(String.format(" Read %d reads so far...", nReads));
            }

            // BUGFIX: previously "nReads > MAX_READS_TO_LOAD" loaded one read beyond
            // the requested cap; >= enforces the cap exactly.
            if ( MAX_READS_TO_LOAD != -1 && nReads >= MAX_READS_TO_LOAD )
                break;
        }
        // BUGFIX: the reader was never closed before (file-handle leak)
        samReader.close();

        logger.info("Done reading input BAM");
    }

    // True if this read should occupy the "first" slot of its pair binding.
    private static boolean isFirstOfPair(SAMRecord read) {
        return ! read.getReadPairedFlag() || read.getFirstOfPairFlag();
    }

    /**
     * Replaces this read's base qualities with those of the matching read from the
     * quals BAM, if one was loaded; otherwise returns the read unmodified.
     *
     * @param ref  reference bases under the read (unused here)
     * @param read the read to (possibly) update in place
     * @return the same read object, with qualities replaced when a match exists
     */
    public SAMRecord map(char[] ref, SAMRecord read) {
        final String name = read.getReadName();
        if ( readNameToPairs.containsKey(name) ) {
            Pair<SAMRecord, SAMRecord> binding = readNameToPairs.get(name);
            SAMRecord qRead = isFirstOfPair(read) ? binding.first : binding.second;
            if (qRead != null) {
                if ( DEBUG ) {
                    System.out.printf("Replacing read %s quals with %s%n", read.getReadName(), qRead.getReadName());
                    System.out.printf("%s%n", read.getReadName());
                    System.out.printf("%s%n", qRead.getReadName());
                    System.out.printf("%s%n", read.getReadString());
                    System.out.printf("%s%n", qRead.getReadString());
                    System.out.printf("%s%n", read.getBaseQualityString());
                    System.out.printf("%s%n", qRead.getBaseQualityString());
                }
                read.setBaseQualities(qRead.getBaseQualities());
            }
        }

        return read;
    }

    // -----------------------------------------------------------------------------------------------
    // Standard i/o reduce
    //

    /** Closes the output BAM writer, if we opened one. */
    public void onTraversalDone(SAMFileWriter output) {
        if ( output != null ) {
            output.close();
        }
    }

    /**
     * Opens the output BAM writer when outputFilename was supplied; otherwise
     * returns null and reads are printed to standard out in reduce().
     */
    public SAMFileWriter reduceInit() {
        if ( outputFilename != null ) {
            SAMFileWriterFactory fact = new SAMFileWriterFactory();
            SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
            return fact.makeBAMWriter(header, true, new File(outputFilename));
        }
        else {
            return null;
        }
    }

    /**
     * Emits each mapped read to the BAM writer, or to standard out when no
     * output file was requested.
     */
    public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
        if ( output != null ) {
            output.addAlignment(read);
        } else {
            out.println(read.format());
        }

        return output;
    }
}

View File

@ -48,9 +48,9 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
}
public SAMFileWriter reduceInit() {
if ( outputFilename != null ) { // ! outputBamFile.equals("") ) {
if ( outputFilename != null ) {
SAMFileWriterFactory fact = new SAMFileWriterFactory();
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
return fact.makeBAMWriter(header, true, new File(outputFilename));
}
else {
@ -83,8 +83,11 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
* @param duplicateReads
* @return
*/
public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates%n", loc, duplicateReads.size()));
public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
SAMRecord combinedRead = null;
if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) {

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
@ -58,51 +59,63 @@ class QualityTracker {
}
}
public void inc(int b1Q, int b2Q, boolean mismatchP) {
// Records one pairwise base comparison in the quality-pair matrix.
// When orderDependent is false, the two qualities are canonicalized so the larger
// value indexes the first dimension — (q1,q2) and (q2,q1) then accumulate in the
// same cell; when true, the qualities are used in the order supplied.
public void inc(int b1Qi, int b2Qi, boolean mismatchP, boolean orderDependent) {
int b1Q = orderDependent ? b1Qi : Math.max(b1Qi, b2Qi);
int b2Q = orderDependent ? b2Qi : Math.min(b1Qi, b2Qi);
// Fail fast on qualities beyond the table bounds instead of indexing out of range.
if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q);
if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q);
mismatchesByQ[b1Q][b2Q].inc(mismatchP);
}
public void inc(DuplicateComp dc) {
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP());
// Convenience overload: unpacks a DuplicateComp into its two qualities and
// mismatch flag, then delegates to inc(int, int, boolean, boolean).
public void inc(DuplicateComp dc, boolean orderDependent) {
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP(), orderDependent);
}
/**
 * Returns the quality score corresponding to the expected rate at which two
 * independently sequenced bases with quality scores q1 and q2 disagree.
 *
 * @param q1 quality score of the first base
 * @param q2 quality score of the second base
 * @return Phred-scaled score for the combined agreement probability (1 - eMM)
 */
public int probMismatchQ1Q2(int q1, int q2) {
    double e1 = 1 - QualityUtils.qualToProb(q1);
    double e2 = 1 - QualityUtils.qualToProb(q2);
    // Mismatch rate: exactly one base in error, with a correction term for the
    // case where both are in error.
    // BUGFIX: the original wrote "1/3 * e1 * e2" — Java integer division makes
    // 1/3 evaluate to 0, silently dropping the double-error term.  Using 1.0/3.0
    // restores the intended fractional coefficient.
    // NOTE(review): the intended coefficient/sign of the double-error term should
    // be confirmed against the derivation; only the integer-division defect is
    // corrected here.
    double eMM = e1 * (1 - e2) + (1 - e1) * e2 - (1.0 / 3.0) * e1 * e2;
    return QualityUtils.probToQual(1 - eMM, 0.0);
}
public void printToStream(PrintStream out, boolean filterUnobserved) {
out.printf("Q1\tQ2\t%s%n", mismatchesByQ[0][0].headerString());
out.printf("Q1\tQ2\tQmin\t%s%n", mismatchesByQ[0][0].headerString());
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
MismatchCounter mc = mismatchesByQ[i][j];
//System.out.printf("MC = %s%n", mc);
if ( filterUnobserved && mc.nObs == 0 )
continue;
out.printf("%d\t%d\t%s\t%n", i, j, mc.toString());
out.printf("%d\t%d\t%d\t%s\t%n", i, j, probMismatchQ1Q2(i,j), mc.toString());
}
}
}
}
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, QualityTracker> {
@Argument(fullName="filterUnobservedQuals", shortName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data")
@Argument(fullName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data")
public boolean FILTER_UNOBSERVED_QUALS = false;
@Argument(fullName="maxPairwiseCompsPerDupSet", shortName="maxPairwiseCompsPerDupSet", required=false, doc="Maximum number of pairwise comparisons to perform among duplicate read sets")
@Argument(fullName="maxPairwiseCompsPerDupSet", required=false, doc="Maximum number of pairwise comparisons to perform among duplicate read sets")
public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET = 100;
@Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
@Argument(fullName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
public boolean COMBINE_QUALS = false;
@Argument(fullName="combineAllDups", shortName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities")
@Argument(fullName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities")
public boolean COMBINE_ALL_DUPS = false;
@Argument(fullName="orderDependent", required=false, doc="")
public boolean orderDependent = false;
@Argument(fullName="compareToUniqueReads", required=false, doc="If true, then we will compare only to unique (i.e., non-duplicated molecules) at the same duplicate site")
public boolean compareToUniqueReads = false;
@Argument(fullName="comparePairToSingleton", required=false, doc="If true, then we will compare a combined dup to a random other read in the duplicate set, not a combined pair itself")
public boolean comparePairToSingleton = false;
final boolean DEBUG = false;
final private boolean ACTUALLY_DO_WORK = true;
@ -116,15 +129,17 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
public QualityTracker reduce(List<DuplicateComp> dupComps, QualityTracker tracker) {
for ( DuplicateComp dc : dupComps ) {
tracker.inc(dc);
tracker.inc(dc, orderDependent);
}
return tracker;
}
// Print out data for regression
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
//out.printf("%s has %d duplicates%n", loc, duplicateReads.size());
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, LocusContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
if ( ! ACTUALLY_DO_WORK )
@ -135,15 +150,21 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
if ( combinedReads != null ) {
SAMRecord combined1 = combinedReads.first;
SAMRecord combined2 = combinedReads.second;
addPairwiseMatches( pairwiseComps, combined1, combined2 );
if ( comparePairToSingleton )
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
else
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
}
} else {
int nComparisons = 0;
for ( SAMRecord read1 : duplicateReads ) {
for ( SAMRecord read2 : duplicateReads ) {
if ( DupUtils.usableDuplicate(read1, read2) ) {
if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
// the hashcode ensures we don't do A vs. B and B vs. A
//System.out.printf("Comparing %s against %s%n", read1, read2);
nComparisons++;
addPairwiseMatches( pairwiseComps, read1, read2 );
pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
break;
}
@ -155,7 +176,32 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
}
private List<DuplicateComp> addPairwiseMatches(List<DuplicateComp> comps,
SAMRecord read1, SAMRecord read2 ) {
SAMRecord read1, SAMRecord read2,
List<SAMRecord> uniqueReads ) {
if ( compareToUniqueReads ) {
// we want to compare to a read in the unique read set
if ( uniqueReads.size() > 0 ) { // there's actually something to compare to
SAMRecord uniqueRead = uniqueReads.get(0); // might as well get the first one
return pairwiseMatches(comps, read1, uniqueRead);
} else {
return comps;
}
} else {
// default, just do read1 vs. read2
return pairwiseMatches(comps, read1, read2);
}
}
/**
* Calculates the pairwise mismatches between reads read1 and read2 and adds the result to the comps list.
* Doesn't contain any logic deciding what to compare, just does read1 and read2
*
* @param comps
* @param read1
* @param read2
* @return
*/
private List<DuplicateComp> pairwiseMatches(List<DuplicateComp> comps, SAMRecord read1, SAMRecord read2 ) {
byte[] read1Bases = read1.getReadBases();
byte[] read1Quals = read1.getBaseQualities();
byte[] read2Bases = read2.getReadBases();
@ -164,7 +210,7 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
for ( int i = 0; i < read1Bases.length; i++) {
byte qual1 = read1Quals[i];
byte qual2 = read2Quals[i];
boolean mismatchP = read1Bases[i] != read2Bases[i];
boolean mismatchP = ! BaseUtils.basesAreEqual(read1Bases[i], read2Bases[i]);
DuplicateComp dc = new DuplicateComp(qual1, qual2, mismatchP);
comps.add(dc);
}