From 8925df2e1ef4618a6c22c8399a5f8f20a92b76ea Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 4 May 2009 21:51:01 +0000 Subject: [PATCH] More information from the duplicate combiner quality metrics git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@590 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/DuplicateQualsWalker.java | 165 +++++++++++++++--- 1 file changed, 139 insertions(+), 26 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java index 1da9cab99..ad9bac07d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java @@ -7,6 +7,8 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.List; @@ -48,7 +50,7 @@ class MismatchCounter { } class QualityTracker { - final private int MAX_QUAL_SCORE = 50; + final private int MAX_QUAL_SCORE = 100; MismatchCounter[][] mismatchesByQ = new MismatchCounter[MAX_QUAL_SCORE][MAX_QUAL_SCORE]; public QualityTracker() { @@ -113,6 +115,10 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q @Argument(fullName="maxPairwiseCompsPerDupSet", shortName="maxPairwiseCompsPerDupSet", required=false, defaultValue="100", doc="Maximumize number of pairwise comparisons to perform among duplicate read sets") public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET; + @Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, defaultValue="false", doc="Combine and assess pairwise base qualities") + public boolean COMBINE_QUALS; + + final boolean DEBUG = false; final private boolean ACTUALLY_DO_WORK = true; public void onTraversalDone(QualityTracker result) { @@ -134,38 +140,145 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q // Print out data for regression public List map(GenomeLoc loc, byte[] refBases, LocusContext context, List duplicateReads) { //out.printf("%s has %d duplicates%n", loc, duplicateReads.size()); - List comps = new ArrayList(); - + List pairwiseComps = new ArrayList(); + if ( ! ACTUALLY_DO_WORK ) - return comps; + return pairwiseComps; - int nComparisons = 0; - for ( SAMRecord read1 : duplicateReads ) { - byte[] read1Bases = read1.getReadBases(); - byte[] read1Quals = read1.getBaseQualities(); - - for ( SAMRecord read2 : duplicateReads ) { - if ( read1 != read2 && read1.getReadLength() == read2.getReadLength()) { - byte[] read2Bases = read2.getReadBases(); - byte[] read2Quals = read2.getBaseQualities(); - nComparisons++; - - for ( int i = 0; i < read1Bases.length; i++) { - byte read1Q = read1Quals[i]; - byte read2Q = read2Quals[i]; - boolean mismatchP = read1Bases[i] != read2Bases[i]; - DuplicateComp dc = new DuplicateComp(read1Q, read2Q, mismatchP); - - //logger.debug(String.format("dc: %s", dc)); - comps.add(dc); + if ( COMBINE_QUALS ) { + Pair combinedReads = combinedReadPair( duplicateReads ); + if ( combinedReads != null ) { + SAMRecord combined1 = combinedReads.first; + SAMRecord combined2 = combinedReads.second; + addPairwiseMatches( pairwiseComps, combined1, combined2 ); + } + } + else { + int nComparisons = 0; + for ( SAMRecord read1 : duplicateReads ) { + for ( SAMRecord read2 : duplicateReads ) { + if ( usableDuplicate(read1, read2) ) { + nComparisons++; + addPairwiseMatches( pairwiseComps, read1, read2 ); + if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) + break; } - - if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) - break; } } } + return pairwiseComps; + } + + private boolean usableDuplicate( SAMRecord read1, SAMRecord read2 ) { + return read1 != read2 && read1.getReadLength() == read2.getReadLength(); + } + + private List addPairwiseMatches(List comps, + SAMRecord read1, SAMRecord read2 ) { + byte[] read1Bases = read1.getReadBases(); + byte[] read1Quals = read1.getBaseQualities(); + byte[] read2Bases = read2.getReadBases(); + byte[] read2Quals = read2.getBaseQualities(); + + for ( int i = 0; i < read1Bases.length; i++) { + byte qual1 = read1Quals[i]; + byte qual2 = read2Quals[i]; + boolean mismatchP = read1Bases[i] != read2Bases[i]; + DuplicateComp dc = new DuplicateComp(qual1, qual2, mismatchP); + comps.add(dc); + } + return comps; } + + private Pair combinedReadPair( List duplicateReads ) { + if ( duplicateReads.size() < 4 ) + return null; + + SAMRecord c1 = combineDuplicates(duplicateReads.get(0),duplicateReads.get(1)); + SAMRecord c2 = combineDuplicates(duplicateReads.get(2),duplicateReads.get(3)); + return new Pair(c1, c2); + } + + private SAMRecord sample3rdRead( List duplicateReads, SAMRecord read1, SAMRecord read2 ) { + if ( duplicateReads.size() <= 2 ) { + // no third unique read is available + return null; + } else { + for ( SAMRecord read3 : duplicateReads ) { + if ( usableDuplicate(read1, read3) && usableDuplicate(read2, read3) ) + return read3; + } + + return null; + } + } + + private SAMRecord tmpCopyRead(SAMRecord read) { + SAMRecord copy = new SAMRecord(read.getHeader()); + copy.setReadName(read.getReadName()); + //copy.setReadString(final String value) { + copy.setReadBases(read.getReadBases()); + copy.setBaseQualities(read.getBaseQualities()); + copy.setReferenceName(read.getReferenceName()); + copy.setReferenceIndex(read.getReferenceIndex()); + copy.setMateReferenceName(read.getMateReferenceName()); + copy.setMateReferenceIndex(read.getMateReferenceIndex()); + copy.setAlignmentStart(read.getAlignmentStart()); + //copy.setAlignmentEnd(read.getAlignmentEnd()); + copy.setMateAlignmentStart(read.getMateAlignmentStart()); + copy.setInferredInsertSize(read.getInferredInsertSize()); + copy.setMappingQuality(read.getMappingQuality()); + copy.setCigar(read.getCigar()); + copy.setFlags(copy.getFlags()); + + return copy; + } + + private SAMRecord combineDuplicates(SAMRecord read1, SAMRecord read2) { + byte[] read1Bases = read1.getReadBases(); + byte[] read1Quals = read1.getBaseQualities(); + byte[] read2Bases = read2.getReadBases(); + byte[] read2Quals = read2.getBaseQualities(); + + byte[] bases = new byte[read1Bases.length]; + byte[] quals = new byte[read1Bases.length]; + + SAMRecord c = tmpCopyRead(read1); + for ( int i = 0; i < read1Bases.length; i++) { + byte base1 = read1Bases[i]; + byte base2 = read2Bases[i]; + byte qual1 = read1Quals[i]; + byte qual2 = read2Quals[i]; + final double p1 = QualityUtils.qualToProb(qual1); + final double p2 = QualityUtils.qualToProb(qual2); + + double pc; + byte basec; + + if ( base1 == base2 ) { + // agreement + basec = base1; + pc = 1 - (1 - p1) * (1 - p2); + } else { + // disagreement + basec = p1 > p2 ? base1 : base2; + pc = p1 > p2 ? p1 : p2; + //pc = 0; + } + + bases[i] = basec; + quals[i] = QualityUtils.probToQual(pc, 0.0); + + if ( DEBUG ) + logger.debug(String.format("Combining %s (Q%2d) with %s (Q%2d) -> %s (Q%2d)%s%n", + (char)base1, qual1, (char)base2, qual2, (char)bases[i], quals[i], + base1 == base2 ? "" : " [MISMATCH]")); + } + c.setReadBases(bases); + c.setBaseQualities(quals); + + return c; + } } \ No newline at end of file