System for traversing duplicate reads, along with a walker to compute quality scores among duplicates and a smarter method to combine quality scores across duplicates -- v1

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@624 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-05-07 18:06:02 +00:00
parent 71e8f47a6c
commit 2204be43eb
4 changed files with 165 additions and 130 deletions

View File

@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
@ -89,8 +90,9 @@ public class TraverseDuplicates extends TraversalEngine {
return l;
}
private List<SAMRecord> collectDuplicates(List<SAMRecord> reads) {
ArrayList<SAMRecord> dups = new ArrayList<SAMRecord>();
private Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
List<SAMRecord> dups = new ArrayList<SAMRecord>();
// find the first duplicate
SAMRecord key = null;
@ -104,7 +106,7 @@ public class TraverseDuplicates extends TraversalEngine {
}
// At this point, there are two possibilities, we have found at least one dup or not
//System.out.printf("Key is %s%n", key);
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
if ( key != null ) {
final GenomeLoc keyLoc = new GenomeLoc(key);
final GenomeLoc keyMateLoc = new GenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
@ -117,11 +119,15 @@ public class TraverseDuplicates extends TraversalEngine {
// we are at the same position as the dup and have the same mate pos, so it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
dups.add(read);
} else {
uniques.add(read);
}
}
} else {
uniques = reads;
}
return dups;
return new Pair<List<SAMRecord>, List<SAMRecord>>(uniques, dups);
}
/**
@ -146,9 +152,13 @@ public class TraverseDuplicates extends TraversalEngine {
for (SAMRecord read: iter) {
// get the genome loc from the read
GenomeLoc site = new GenomeLoc(read);
logger.debug(String.format("*** TraverseDuplicates.traverse at %s", site));
List<SAMRecord> reads = readsAtLoc(read, iter);
List<SAMRecord> duplicateReads = collectDuplicates(reads);
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
List<SAMRecord> uniqueReads = split.getFirst();
List<SAMRecord> duplicateReads = split.getSecond();
logger.debug(String.format("*** TraverseDuplicates.traverse at %s has %d unique and %d duplicate reads",
site, uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location
LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0));
@ -161,12 +171,17 @@ public class TraverseDuplicates extends TraversalEngine {
byte[] refBases = new byte[0];
final boolean keepMeP = dupWalker.filter(site, refBases, locus, duplicateReads);
if (keepMeP) {
M x = dupWalker.map(site, refBases, locus, duplicateReads);
sum = dupWalker.reduce(x, sum);
if ( dupWalker.mapUniqueReadsTooP() ) {
// Send each unique read to the map function
for ( SAMRecord unique : uniqueReads ) {
List<SAMRecord> l = Arrays.asList(unique);
sum = mapOne(dupWalker, l, site, refBases, locus, sum);
}
}
if ( duplicateReads.size() > 0 )
sum = mapOne(dupWalker, duplicateReads, site, refBases, locus, sum);
printProgress("dups", site);
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
@ -213,7 +228,20 @@ public class TraverseDuplicates extends TraversalEngine {
return result;
}
}
/**
 * Run one filter/map/reduce cycle of the duplicate walker over a single read set.
 *
 * @param dupWalker the walker supplying filter/map/reduce
 * @param readSet   the reads (unique singleton or duplicate set) to process
 * @param site      the genomic location of the read set
 * @param refBases  reference bases at the site (may be empty)
 * @param locus     the locus context built for this site
 * @param sum       the running reduce accumulator
 * @return the accumulator, updated only if the walker's filter accepted the read set
 */
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
                       List<SAMRecord> readSet,
                       GenomeLoc site,
                       byte[] refBases,
                       LocusContext locus,
                       T sum) {
    // guard clause: a rejected read set leaves the accumulator untouched
    if ( ! dupWalker.filter(site, refBases, locus, readSet) )
        return sum;
    final M mapped = dupWalker.map(site, refBases, locus, readSet);
    return dupWalker.reduce(mapped, sum);
}
// --------------------------------------------------------------------------------------------------------------
//

View File

@ -29,6 +29,15 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
public boolean requiresReads() { return true; }
public boolean cannotHandleReads() { return false; }
/**
* Called by the traversal engine to decide whether to send non-duplicates as lists of
* singleton reads to the map function. By default it's false.
*
* @return true if you want to see non duplicates during the traversal
*/
public boolean mapUniqueReadsTooP() { return false; }
// Map over the org.broadinstitute.sting.gatk.LocusContext
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads);

View File

@ -0,0 +1,108 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List;
import java.util.ArrayList;
import java.io.PrintStream;
import java.io.File;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMFileHeader;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Walker that collapses each set of duplicate reads into a single consensus read
 * (via DupUtils.combineDuplicates) and writes the result either to a BAM file or,
 * when no output file is given, to standard out.
 */
public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileWriter> {
    @Argument(fullName="outputBAM", shortName="outputBAM", required=false, defaultValue="", doc="BAM File to write combined duplicates to")
    public String outputFilename;

    @Argument(fullName="includeUniqueReads", shortName="includeUniqueReads", required=false, defaultValue="true", doc="If true, also writes out non-duplicate reads in file")
    public boolean INCLUDE_UNIQUE_READS;

    @Argument(fullName="maxQ", shortName="maxQ", required=false, defaultValue="50",
            doc="The maximum Q score allowed for combined reads, reflects the background error rate giving rise to perfect bases that don't correspond to the reference")
    public int MAX_QUALITY_SCORE;

    final boolean DEBUG = false;

    /**
     * Tell the traversal engine whether to also send non-duplicate reads (as
     * singleton lists) to map(); driven by the -includeUniqueReads argument.
     */
    public boolean mapUniqueReadsTooP() {
        return INCLUDE_UNIQUE_READS;
    }

    // -----------------------------------------------------------------------------------------------
    // Standard i/o reduce
    //

    /** Close the BAM writer, if one was opened, when the traversal finishes. */
    public void onTraversalDone(SAMFileWriter output) {
        if ( output != null ) {
            output.close();
        }
    }

    /**
     * Open the output BAM writer, or return null to fall back to printing reads
     * to standard out.
     */
    public SAMFileWriter reduceInit() {
        // BUGFIX: the @Argument defaultValue is "", so a null check alone never
        // reached the stdout fallback and new File("") was handed to the writer;
        // require a non-empty filename before creating a BAM writer
        if ( outputFilename != null && !outputFilename.equals("") ) {
            SAMFileWriterFactory fact = new SAMFileWriterFactory();
            SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
            return fact.makeBAMWriter(header, true, new File(outputFilename));
        }
        else {
            return null;
        }
    }

    /**
     * Emit one combined read: to the BAM writer when available, otherwise to
     * standard out in SAM text form.
     */
    public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
        if ( output != null ) {
            output.addAlignment(read);
        } else {
            out.println(read.format());
        }
        return output;
    }

    /**
     * Build a combined read given the input list of non-unique reads. If there's just one read in the
     * set, it's considered unique and returned. If there's more than one, the N-way combine
     * duplicate function is invoked.
     *
     * @param loc the genomic location shared by the duplicate set
     * @param refBases reference bases at the site (unused here)
     * @param context the locus context for the site (unused here)
     * @param duplicateReads the reads to collapse; must be non-empty
     * @return the single input read, or the consensus read built from the set
     */
    public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
        if ( duplicateReads.size() == 1 ) {
            // we are a unique read; nothing to combine
            return duplicateReads.get(0);
        } else {
            // actually call the N-way combine function, capping qualities at MAX_QUALITY_SCORE
            return DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE);
        }
    }
}

View File

@ -1,20 +1,17 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List;
import java.util.ArrayList;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import net.sf.samtools.SAMRecord;
@ -68,7 +65,7 @@ class QualityTracker {
}
public void inc(DuplicateComp dc) {
inc(dc.qLarger, dc.qSmaller, dc.mismatchP);
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP());
}
public void printToStream(PrintStream out, boolean filterUnobserved) {
@ -85,21 +82,6 @@ class QualityTracker {
}
}
/**
 * A single pairwise base comparison between two duplicate reads: the two base
 * qualities ordered largest-first, plus whether the bases themselves disagreed.
 */
class DuplicateComp {
    int qLarger;        // larger of the two base qualities
    int qSmaller;       // smaller of the two base qualities
    boolean mismatchP;  // true if the two bases disagreed

    public DuplicateComp( int b1Q, int b2Q, boolean mismatchP ) {
        if ( b1Q >= b2Q ) {
            qLarger = b1Q;
            qSmaller = b2Q;
        } else {
            qLarger = b2Q;
            qSmaller = b1Q;
        }
        this.mismatchP = mismatchP;
    }

    public String toString() {
        return qLarger + " " + qSmaller + " " + mismatchP;
    }
}
/**
* Created by IntelliJ IDEA.
@ -118,6 +100,9 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
@Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
public boolean COMBINE_QUALS = false;
@Argument(fullName="combineAllDups", shortName="combineAllDups", required=false, defaultValue="false", doc="Combine and assess pairwise base qualities")
public boolean COMBINE_ALL_DUPS;
final boolean DEBUG = false;
final private boolean ACTUALLY_DO_WORK = true;
@ -146,18 +131,17 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
return pairwiseComps;
if ( COMBINE_QUALS ) {
Pair<SAMRecord, SAMRecord> combinedReads = combinedReadPair( duplicateReads );
Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
if ( combinedReads != null ) {
SAMRecord combined1 = combinedReads.first;
SAMRecord combined2 = combinedReads.second;
addPairwiseMatches( pairwiseComps, combined1, combined2 );
}
}
else {
} else {
int nComparisons = 0;
for ( SAMRecord read1 : duplicateReads ) {
for ( SAMRecord read2 : duplicateReads ) {
if ( usableDuplicate(read1, read2) ) {
if ( DupUtils.usableDuplicate(read1, read2) ) {
nComparisons++;
addPairwiseMatches( pairwiseComps, read1, read2 );
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
@ -169,10 +153,6 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
return pairwiseComps;
}
/**
 * Can these two reads be compared as duplicates? They must be distinct objects
 * and have identical read lengths (so bases line up index-for-index).
 */
private boolean usableDuplicate( SAMRecord read1, SAMRecord read2 ) {
    if ( read1 == read2 )
        return false;
    return read1.getReadLength() == read2.getReadLength();
}
private List<DuplicateComp> addPairwiseMatches(List<DuplicateComp> comps,
SAMRecord read1, SAMRecord read2 ) {
@ -191,94 +171,4 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
return comps;
}
/**
 * Build two independent combined reads from the first four duplicates, pairing
 * reads (0,1) and (2,3).
 *
 * @param duplicateReads the duplicate set to draw from
 * @return the two combined reads, or null if fewer than four reads are available
 */
private Pair<SAMRecord, SAMRecord> combinedReadPair( List<SAMRecord> duplicateReads ) {
    // two disjoint pairs require at least four reads
    if ( duplicateReads.size() < 4 )
        return null;
    final SAMRecord first = combineDuplicates(duplicateReads.get(0), duplicateReads.get(1));
    final SAMRecord second = combineDuplicates(duplicateReads.get(2), duplicateReads.get(3));
    return new Pair<SAMRecord, SAMRecord>(first, second);
}
/**
 * Find a third read in the duplicate set that is usable against both read1 and
 * read2 (distinct from each and matching both in length).
 *
 * @return the first such read, or null if the set has no usable third read
 */
private SAMRecord sample3rdRead( List<SAMRecord> duplicateReads, SAMRecord read1, SAMRecord read2 ) {
    if ( duplicateReads.size() > 2 ) {
        for ( SAMRecord candidate : duplicateReads ) {
            if ( usableDuplicate(read1, candidate) && usableDuplicate(read2, candidate) )
                return candidate;
        }
    }
    // fewer than three reads, or no candidate was usable against both
    return null;
}
/**
 * Make a working copy of a read, carrying over name, bases, qualities,
 * alignment coordinates, mate info, insert size, mapping quality, CIGAR,
 * and flags. Used as the scaffold for a combined consensus read.
 *
 * @param read the read to copy
 * @return a new SAMRecord sharing the input's header and alignment metadata
 */
private SAMRecord tmpCopyRead(SAMRecord read) {
    SAMRecord copy = new SAMRecord(read.getHeader());
    copy.setReadName(read.getReadName());
    copy.setReadBases(read.getReadBases());
    copy.setBaseQualities(read.getBaseQualities());
    copy.setReferenceName(read.getReferenceName());
    copy.setReferenceIndex(read.getReferenceIndex());
    copy.setMateReferenceName(read.getMateReferenceName());
    copy.setMateReferenceIndex(read.getMateReferenceIndex());
    copy.setAlignmentStart(read.getAlignmentStart());
    // alignment end is derived from start + CIGAR, so it is not set explicitly
    copy.setMateAlignmentStart(read.getMateAlignmentStart());
    copy.setInferredInsertSize(read.getInferredInsertSize());
    copy.setMappingQuality(read.getMappingQuality());
    copy.setCigar(read.getCigar());
    // BUGFIX: was copy.setFlags(copy.getFlags()), which re-assigned the fresh
    // copy's default flags to itself and silently dropped the source read's flags
    copy.setFlags(read.getFlags());
    return copy;
}
/**
 * Combine two duplicate reads base-by-base into a single consensus read.
 * Where the bases agree, the combined quality reflects the joint probability
 * that both calls are correct; where they disagree, the higher-confidence base
 * wins at its own confidence.
 *
 * NOTE(review): indexes read2's arrays by read1's length — assumes equal read
 * lengths, as enforced by usableDuplicate(); confirm all callers guarantee this.
 *
 * @param read1 first duplicate; also supplies the metadata for the result
 * @param read2 second duplicate
 * @return a new consensus read built on a copy of read1
 */
private SAMRecord combineDuplicates(SAMRecord read1, SAMRecord read2) {
    final byte[] bases1 = read1.getReadBases();
    final byte[] quals1 = read1.getBaseQualities();
    final byte[] bases2 = read2.getReadBases();
    final byte[] quals2 = read2.getBaseQualities();

    final int len = bases1.length;
    final byte[] combinedBases = new byte[len];
    final byte[] combinedQuals = new byte[len];

    SAMRecord combined = tmpCopyRead(read1);

    for ( int i = 0; i < len; i++ ) {
        final byte b1 = bases1[i];
        final byte b2 = bases2[i];
        final byte q1 = quals1[i];
        final byte q2 = quals2[i];
        final double p1 = QualityUtils.qualToProb(q1);
        final double p2 = QualityUtils.qualToProb(q2);

        final byte consensusBase;
        final double consensusProb;
        if ( b1 == b2 ) {
            // agreement: probability that at least one of the two calls is right
            consensusBase = b1;
            consensusProb = 1 - (1 - p1) * (1 - p2);
        } else {
            // disagreement: keep the higher-confidence base at its own confidence
            consensusBase = p1 > p2 ? b1 : b2;
            consensusProb = p1 > p2 ? p1 : p2;
        }

        combinedBases[i] = consensusBase;
        combinedQuals[i] = QualityUtils.probToQual(consensusProb, 0.0);

        if ( DEBUG )
            logger.debug(String.format("Combining %s (Q%2d) with %s (Q%2d) -> %s (Q%2d)%s%n",
                    (char)b1, q1, (char)b2, q2, (char)combinedBases[i], combinedQuals[i],
                    b1 == b2 ? "" : " [MISMATCH]"));
    }

    combined.setReadBases(combinedBases);
    combined.setBaseQualities(combinedQuals);
    return combined;
}
}