Initial version of ByDuplicates traversal, as well as a duplicate quality score estimator
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@576 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ff420f5f6f
commit
84dae06d5a
|
|
@ -11,9 +11,7 @@ import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
|||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
|
@ -269,9 +267,14 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
}
|
||||
} else if ( my_walker instanceof LocusWindowWalker ) {
|
||||
this.engine = new TraverseByLocusWindows(INPUT_FILES, REF_FILE_ARG, rods);
|
||||
} else {
|
||||
} else if ( my_walker instanceof ReadWalker) {
|
||||
// we're a read walker
|
||||
this.engine = new TraverseByReads(INPUT_FILES, REF_FILE_ARG, rods);
|
||||
} else if ( my_walker instanceof DuplicateWalker) {
|
||||
// we're a duplicate walker
|
||||
this.engine = new TraverseDuplicates(INPUT_FILES, REF_FILE_ARG, rods);
|
||||
} else {
|
||||
throw new RuntimeException("Unexpected walker type: " + my_walker);
|
||||
}
|
||||
|
||||
// Prepare the sort ordering w.r.t. the sequence dictionary
|
||||
|
|
|
|||
|
|
@ -0,0 +1,289 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
import edu.mit.broad.picard.filter.FilteringIterator;
|
||||
import edu.mit.broad.picard.filter.SamRecordFilter;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 24, 2009
|
||||
* Time: 10:35:22 AM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author Mark DePristo
|
||||
* @version 0.1
|
||||
* @date Apr 29, 2009
|
||||
* <p/>
|
||||
* Class TraverseDuplicates
|
||||
* <p/>
|
||||
* This class handles traversing lists of duplicate reads in the new shardable style
|
||||
*/
|
||||
public class TraverseDuplicates extends TraversalEngine {
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(TraverseDuplicates.class);
|
||||
|
||||
private final boolean DEBUG = false;
|
||||
|
||||
/**
|
||||
* Creates a new, uninitialized TraversalEngine
|
||||
*
|
||||
* @param reads SAM/BAM file of reads
|
||||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
||||
* @param rods Array of reference ordered data sets
|
||||
*/
|
||||
public TraverseDuplicates(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
super(reads, ref, rods);
|
||||
}
|
||||
|
||||
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter)
|
||||
{
|
||||
GenomeLoc site = new GenomeLoc(read);
|
||||
ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
|
||||
|
||||
l.add(read);
|
||||
for (SAMRecord read2: iter) {
|
||||
GenomeLoc site2 = new GenomeLoc(read2);
|
||||
|
||||
// the next read starts too late
|
||||
if ( site2.getStart() != site.getStart() ) {
|
||||
//System.out.printf("site = %s, site2 = %s%n", site, site2);
|
||||
iter.pushback(read2);
|
||||
break;
|
||||
} else {
|
||||
//System.out.printf("Read is a duplicate: %s%n", read.format());
|
||||
l.add(read2);
|
||||
}
|
||||
}
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
private List<SAMRecord> collectDuplicates(List<SAMRecord> reads) {
|
||||
ArrayList<SAMRecord> dups = new ArrayList<SAMRecord>();
|
||||
|
||||
// find the first duplicate
|
||||
SAMRecord key = null;
|
||||
for ( SAMRecord read : reads ) {
|
||||
if ( read.getDuplicateReadFlag() ) {
|
||||
// this is our key
|
||||
key = read;
|
||||
if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, there are two possibilities, we have found at least one dup or not
|
||||
//System.out.printf("Key is %s%n", key);
|
||||
if ( key != null ) {
|
||||
final GenomeLoc keyLoc = new GenomeLoc(key);
|
||||
final GenomeLoc keyMateLoc = new GenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
||||
|
||||
for ( SAMRecord read : reads ) {
|
||||
final GenomeLoc readLoc = new GenomeLoc(read);
|
||||
final GenomeLoc readMateLoc = new GenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
|
||||
if (DEBUG) logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
|
||||
if ( readLoc.compareTo(keyLoc) == 0 && readMateLoc.compareTo(keyMateLoc) == 0 ) {
|
||||
// we are at the same position as the dup and have the same mat pos, it's a dup
|
||||
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
|
||||
dups.add(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return dups;
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
* @param walker the walker to execute over
|
||||
* @param shard the shard of data to feed the walker
|
||||
* @param locusProvider the factory for loci
|
||||
* @param sum of type T, the return from the walker
|
||||
* @param <M> the generic type
|
||||
* @param <T> the return type of the reduce function
|
||||
* @return
|
||||
*/
|
||||
public <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
|
||||
Iterator<SAMRecord> readIter,
|
||||
T sum) {
|
||||
// while we still have more reads
|
||||
// ok, here's the idea. We get all the reads that start at the same position in the genome
|
||||
// We then split the list of reads into sublists of reads:
|
||||
// -> those with the same mate pair position, for paired reads
|
||||
// -> those flagged as unpaired and duplicated but having the same start and end and
|
||||
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
|
||||
for (SAMRecord read: iter) {
|
||||
// get the genome loc from the read
|
||||
GenomeLoc site = new GenomeLoc(read);
|
||||
logger.debug(String.format("*** TraverseDuplicates.traverse at %s", site));
|
||||
List<SAMRecord> reads = readsAtLoc(read, iter);
|
||||
List<SAMRecord> duplicateReads = collectDuplicates(reads);
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0));
|
||||
|
||||
// update the number of duplicate sets we've seen
|
||||
TraversalStatistics.nRecords++;
|
||||
|
||||
// we still have to fix the locus context provider to take care of this problem with > 1 length contexts
|
||||
// LocusContext locus = locusProvider.getLocusContext(site);
|
||||
|
||||
byte[] refBases = new byte[0];
|
||||
|
||||
final boolean keepMeP = dupWalker.filter(site, refBases, locus, duplicateReads);
|
||||
if (keepMeP) {
|
||||
M x = dupWalker.map(site, refBases, locus, duplicateReads);
|
||||
sum = dupWalker.reduce(x, sum);
|
||||
}
|
||||
|
||||
printProgress("dups", site);
|
||||
|
||||
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||
logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Class to filter out un-handle-able reads from the stream. We currently are skipping
|
||||
* unmapped reads, non-primary reads, unaligned reads, and duplicate reads.
|
||||
*/
|
||||
public static class duplicateStreamFilterFunc implements SamRecordFilter {
|
||||
SAMRecord lastRead = null;
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
boolean result = false;
|
||||
String why = "";
|
||||
if (rec.getReadUnmappedFlag()) {
|
||||
TraversalStatistics.nUnmappedReads++;
|
||||
result = true;
|
||||
why = "Unmapped";
|
||||
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||
TraversalStatistics.nNotPrimary++;
|
||||
result = true;
|
||||
why = "Not Primary";
|
||||
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
why = "No alignment start";
|
||||
}
|
||||
else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
if (result) {
|
||||
TraversalStatistics.nSkippedReads++;
|
||||
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
||||
} else {
|
||||
TraversalStatistics.nReads++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// old style interface to the system
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
||||
if ( walker instanceof DuplicateWalker) {
|
||||
Walker x = walker;
|
||||
DuplicateWalker<?, ?> dupWalker = (DuplicateWalker<?, ?>)x;
|
||||
return (T)this.traverseByRead(dupWalker, locations);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we deleted at the soonist possible opportunity
|
||||
*/
|
||||
public <M, T> Object traverseByRead(DuplicateWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
|
||||
samReadIter = initializeReads();
|
||||
|
||||
// Initialize the walker
|
||||
walker.initialize();
|
||||
|
||||
// Initialize the sum
|
||||
FilteringIterator filterIter = new FilteringIterator(samReadIter, new duplicateStreamFilterFunc());
|
||||
T sum = actuallyTraverse(walker, filterIter, walker.reduceInit());
|
||||
|
||||
//printOnTraversalDone("reads", sum);
|
||||
walker.onTraversalDone(sum);
|
||||
return sum;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// new style interface to the system
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
* @param walker the walker to execute over
|
||||
* @param shard the shard of data to feed the walker
|
||||
* @param locusProvider the factory for loci
|
||||
* @param sum of type T, the return from the walker
|
||||
* @param <M> the generic type
|
||||
* @param <T> the return type of the reduce function
|
||||
* @return
|
||||
*/
|
||||
public <M, T> T traverse(Walker<M, T> walker,
|
||||
Shard shard,
|
||||
LocusContextProvider locusProvider,
|
||||
BoundedReadIterator readIter,
|
||||
T sum) {
|
||||
|
||||
logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard)shard).getSize()));
|
||||
|
||||
if (!(walker instanceof DuplicateWalker))
|
||||
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
|
||||
|
||||
DuplicateWalker<M, T> dupWalker = (DuplicateWalker<M, T>) walker;
|
||||
|
||||
// while we still have more reads
|
||||
// ok, here's the idea. We get all the reads that start at the same position in the genome
|
||||
// We then split the list of reads into sublists of reads:
|
||||
// -> those with the same mate pair position, for paired reads
|
||||
// -> those flagged as unpaired and duplicated but having the same start and end and
|
||||
|
||||
FilteringIterator filterIter = new FilteringIterator(readIter, new duplicateStreamFilterFunc());
|
||||
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
|
||||
return actuallyTraverse(dupWalker, readIter, sum);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Feb 22, 2009
|
||||
* Time: 2:52:28 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
|
||||
return true; // We are keeping all the reads
|
||||
}
|
||||
|
||||
/**
|
||||
* These two functions state whether we're don't make any sense without reads (requiresRead())
|
||||
* or whether we can't take any reads at all (cannotHandleRead())
|
||||
*/
|
||||
public boolean requiresReads() { return true; }
|
||||
public boolean cannotHandleReads() { return false; }
|
||||
|
||||
// Map over the org.broadinstitute.sting.gatk.LocusContext
|
||||
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads);
|
||||
|
||||
// Given result of map function
|
||||
public abstract ReduceType reduceInit();
|
||||
public abstract ReduceType reduce(MapType value, ReduceType sum);
|
||||
}
|
||||
|
|
@ -0,0 +1,171 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
class MismatchCounter {
|
||||
long nObs = 0;
|
||||
long nMismatches = 0;
|
||||
|
||||
public void inc(long incNObs, long incNMismatches) {
|
||||
nObs += incNObs;
|
||||
nMismatches += incNMismatches;
|
||||
}
|
||||
|
||||
public void inc(boolean mismatchP) {
|
||||
inc(1, mismatchP ? 1 : 0);
|
||||
}
|
||||
|
||||
|
||||
public double mismatchRate() {
|
||||
return (double)nMismatches / nObs;
|
||||
}
|
||||
|
||||
public byte empiricalQualScore() {
|
||||
return QualityUtils.probToQual(1 - mismatchRate(), 0);
|
||||
}
|
||||
|
||||
public String headerString() {
|
||||
return "mismatchRate\tempiricalQ\tnObs\tnMismatches";
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%.10f\t%d\t%d\t%6d", mismatchRate(), empiricalQualScore(), nObs, nMismatches);
|
||||
}
|
||||
}
|
||||
|
||||
class QualityTracker {
|
||||
final private int MAX_QUAL_SCORE = 50;
|
||||
MismatchCounter[][] mismatchesByQ = new MismatchCounter[MAX_QUAL_SCORE][MAX_QUAL_SCORE];
|
||||
|
||||
public QualityTracker() {
|
||||
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
|
||||
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
|
||||
mismatchesByQ[i][j] = new MismatchCounter();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void inc(int b1Q, int b2Q, boolean mismatchP) {
|
||||
if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q);
|
||||
if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q);
|
||||
mismatchesByQ[b1Q][b2Q].inc(mismatchP);
|
||||
}
|
||||
|
||||
public void inc(DuplicateComp dc) {
|
||||
inc(dc.qLarger, dc.qSmaller, dc.mismatchP);
|
||||
}
|
||||
|
||||
public void printToStream(PrintStream out, boolean filterUnobserved) {
|
||||
out.printf("Q1\tQ2\t%s%n", mismatchesByQ[0][0].headerString());
|
||||
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
|
||||
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
|
||||
MismatchCounter mc = mismatchesByQ[i][j];
|
||||
//System.out.printf("MC = %s%n", mc);
|
||||
if ( filterUnobserved && mc.nObs == 0 )
|
||||
continue;
|
||||
out.printf("%d\t%d\t%s\t%n", i, j, mc.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class DuplicateComp {
|
||||
int qLarger;
|
||||
int qSmaller;
|
||||
boolean mismatchP;
|
||||
|
||||
public DuplicateComp( int b1Q, int b2Q, boolean mismatchP ) {
|
||||
qLarger = Math.max(b1Q, b2Q);
|
||||
qSmaller = Math.min(b1Q, b2Q);
|
||||
this.mismatchP = mismatchP;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%d %d %b", qLarger, qSmaller, mismatchP);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Feb 22, 2009
|
||||
* Time: 2:52:28 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, QualityTracker> {
|
||||
@Argument(fullName="filterUnobservedQuals", shortName="filterUnobservedQuals", required=false, defaultValue="false", doc="Show only quality bins with at least one observation in the data")
|
||||
public boolean FILTER_UNOBSERVED_QUALS;
|
||||
|
||||
@Argument(fullName="maxPairwiseCompsPerDupSet", shortName="maxPairwiseCompsPerDupSet", required=false, defaultValue="100", doc="Maximumize number of pairwise comparisons to perform among duplicate read sets")
|
||||
public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET;
|
||||
|
||||
final private boolean ACTUALLY_DO_WORK = true;
|
||||
|
||||
public void onTraversalDone(QualityTracker result) {
|
||||
result.printToStream(out, FILTER_UNOBSERVED_QUALS);
|
||||
}
|
||||
|
||||
public QualityTracker reduceInit() {
|
||||
return new QualityTracker();
|
||||
}
|
||||
|
||||
public QualityTracker reduce(List<DuplicateComp> dupComps, QualityTracker tracker) {
|
||||
for ( DuplicateComp dc : dupComps ) {
|
||||
tracker.inc(dc);
|
||||
}
|
||||
|
||||
return tracker;
|
||||
}
|
||||
|
||||
// Print out data for regression
|
||||
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
|
||||
//out.printf("%s has %d duplicates%n", loc, duplicateReads.size());
|
||||
List<DuplicateComp> comps = new ArrayList<DuplicateComp>();
|
||||
|
||||
if ( ! ACTUALLY_DO_WORK )
|
||||
return comps;
|
||||
|
||||
int nComparisons = 0;
|
||||
for ( SAMRecord read1 : duplicateReads ) {
|
||||
byte[] read1Bases = read1.getReadBases();
|
||||
byte[] read1Quals = read1.getBaseQualities();
|
||||
|
||||
for ( SAMRecord read2 : duplicateReads ) {
|
||||
if ( read1 != read2 && read1.getReadLength() == read2.getReadLength()) {
|
||||
byte[] read2Bases = read2.getReadBases();
|
||||
byte[] read2Quals = read2.getBaseQualities();
|
||||
nComparisons++;
|
||||
|
||||
for ( int i = 0; i < read1Bases.length; i++) {
|
||||
byte read1Q = read1Quals[i];
|
||||
byte read2Q = read2Quals[i];
|
||||
boolean mismatchP = read1Bases[i] != read2Bases[i];
|
||||
DuplicateComp dc = new DuplicateComp(read1Q, read2Q, mismatchP);
|
||||
|
||||
//logger.debug(String.format("dc: %s", dc));
|
||||
comps.add(dc);
|
||||
}
|
||||
|
||||
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return comps;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue