Better query start / stop function that directly parses the cigar string, unlike the previous version. Now properly handles H (hard-clipped) reads. Added -baq OFF and -baq RECALCULATE integration tests on all three 1KG technologies. Please let me know if this new code somehow fails.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5108 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9cb1ae384c
commit
2182b8c7e2
|
|
@ -1,10 +1,12 @@
|
|||
package org.broadinstitute.sting.utils.baq;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
|
|
@ -159,8 +161,6 @@ public class BAQ {
|
|||
// NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY. CHANGES TO THIS CODE SHOULD BE
|
||||
// NOTE -- PUSHED BACK TO HENG LI
|
||||
//
|
||||
// Note that _ref and _query are in the special 0-4 encoding [see above for docs]
|
||||
//
|
||||
// ####################################################################################################
|
||||
public int hmm_glocal(final byte[] ref, final byte[] query, int qstart, int l_query, final byte[] _iqual, int[] state, byte[] q) {
|
||||
if ( ref == null ) throw new ReviewedStingException("BUG: ref sequence is null");
|
||||
|
|
@ -491,6 +491,10 @@ public class BAQ {
|
|||
|
||||
public static int MAG = 1; // todo -- remove me for performance testing only
|
||||
public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals, int queryStart, int queryEnd ) {
|
||||
if ( queryStart < 0 ) throw new ReviewedStingException("BUG: queryStart < 0: " + queryStart);
|
||||
if ( queryEnd < 0 ) throw new ReviewedStingException("BUG: queryEnd < 0: " + queryEnd);
|
||||
if ( queryEnd < queryStart ) throw new ReviewedStingException("BUG: queryStart < queryEnd : " + queryStart + " end =" + queryEnd);
|
||||
|
||||
// note -- assumes ref is offset from the *CLIPPED* start
|
||||
BAQCalculationResult baqResult = new BAQCalculationResult(query, quals, ref);
|
||||
int queryLen = queryEnd - queryStart;
|
||||
|
|
@ -499,11 +503,65 @@ public class BAQ {
|
|||
return baqResult;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determine the appropriate start and stop offsets in the reads for the bases given the cigar string
|
||||
* @param read
|
||||
* @return
|
||||
*/
|
||||
private final Pair<Integer,Integer> calculateQueryRange(SAMRecord read) {
|
||||
return calculateQueryRangeCigar(read);
|
||||
// comparison of old and new way of doing it
|
||||
// Pair<Integer, Integer> cigarResult = calculateQueryRangeCigar(read);
|
||||
// Pair<Integer, Integer> prevResult = calculateQueryRangeAlignment(read);
|
||||
//
|
||||
// if ( ! cigarResult.equals(prevResult) ) {
|
||||
// throw new ReviewedStingException("BUG: cigar and prev results differ at read " + read + ", aligment start " + read.getAlignmentStart());
|
||||
// }
|
||||
//
|
||||
// return true ? cigarResult : prevResult;
|
||||
}
|
||||
|
||||
private final Pair<Integer,Integer> calculateQueryRangeCigar(SAMRecord read) {
|
||||
int queryStart = -1, queryStop = -1;
|
||||
int readI = 0;
|
||||
|
||||
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
|
||||
switch (elt.getOperator()) {
|
||||
case N: return null; // cannot handle these
|
||||
case H : case P : case D: break; // ignore pads, hard clips, and deletions
|
||||
case I : case S: case M:
|
||||
int prev = readI;
|
||||
readI += elt.getLength();
|
||||
if ( includeClippedBases || elt.getOperator() != CigarOperator.S) {
|
||||
if ( queryStart == -1 )
|
||||
queryStart = prev;
|
||||
queryStop = readI;
|
||||
}
|
||||
// in the else case we aren't including soft clipped bases, so we don't update
|
||||
// queryStart or queryStop
|
||||
break;
|
||||
default: throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
|
||||
}
|
||||
}
|
||||
return new Pair<Integer, Integer>(queryStart, queryStop);
|
||||
}
|
||||
|
||||
private final Pair<Integer,Integer> calculateQueryRangeAlignment(SAMRecord read) {
|
||||
int queryStart = includeClippedBases ? 0 : read.getAlignmentStart() - read.getUnclippedStart();
|
||||
int queryEnd = read.getReadLength() - (includeClippedBases ? 0 : read.getUnclippedEnd() - read.getAlignmentEnd());
|
||||
return new Pair<Integer, Integer>(queryStart, queryEnd);
|
||||
}
|
||||
|
||||
// we need to bad ref by at least the bandwidth / 2 on either side
|
||||
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
|
||||
// todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
|
||||
int queryStart = includeClippedBases ? 0 : read.getAlignmentStart() - read.getUnclippedStart();
|
||||
int queryEnd = read.getReadLength() - (includeClippedBases ? 0 : read.getUnclippedEnd() - read.getAlignmentEnd());
|
||||
Pair<Integer, Integer> queryRange = calculateQueryRange(read);
|
||||
if ( queryRange == null ) return null; // read has Ns
|
||||
|
||||
int queryStart = queryRange.getFirst();
|
||||
int queryEnd = queryRange.getSecond();
|
||||
|
||||
BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);
|
||||
|
||||
// cap quals
|
||||
|
|
|
|||
|
|
@ -0,0 +1,30 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class BAQIntegrationTest extends WalkerTest {
|
||||
private final static String baseCommand = "-T PrintReads -R " + b36KGReference +
|
||||
" -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
|
||||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000";
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing BAQ with SLX, 454, and SOLID data
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testPrintReadsNoBAQ() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("902197bf77ed5a828d50e08771685928"));
|
||||
executeTest(String.format("testPrintReadsNoBAQ"), spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPrintReadsRecalBAQ() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq RECALCULATE", 1, Arrays.asList("4ac691bde1ba1301a59857694fda6ae2"));
|
||||
executeTest(String.format("testPrintReadsRecalBAQ"), spec);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue