First mostly functional implementation of banded pair-HMM likelihood computation for the indel caller. More experimentation to follow, but it currently works on small data sets and at least it doesn't break existing behavior. Disabled by default at this point.

This commit is contained in:
Guillermo del Angel 2011-09-28 15:51:48 -04:00
commit e2b9030e93
10 changed files with 329 additions and 848 deletions

View File

@ -19,8 +19,19 @@ import java.util.Map;
/**
* Total (unfiltered) depth over all samples.
*
* Affected by downsampling (-dcov) though, so the max value one can obtain for N samples with -dcov D
* is N * D
* This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample
* at this site. The DP field describes the total depth of reads that passed the Unified Genotyper's internal
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
* The AD values (one for each of REF and ALT fields) are the counts of all reads that carried with them the
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
* the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
* normally be excluded from the statistical calculations going into GQ and QUAL.
*
* Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with
* -dcov D is N * D
*/
public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation {

View File

@ -24,9 +24,9 @@ import java.util.Map;
/**
* The depth of coverage of each VCF allele in this sample
* The depth of coverage of each VCF allele in this sample.
*
* Complementary fields that two important ways of thinking about the depth of the data for this sample
* This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample
* at this site. The DP field describes the total depth of reads that passed the Unified Genotyper's internal
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
* The AD values (one for each of REF and ALT fields) are the counts of all reads that carried with them the
@ -38,8 +38,8 @@ import java.util.Map;
* normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
* the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that
* are actually present and correctly left-aligned in the alignments themselves). Because of this fact and
* because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base
* assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what
* because the AD includes reads and bases that were filtered by the Unified Genotyper, <b>one should not base
* assumptions about the underlying genotype based on it</b>; instead, the genotype likelihoods (PLs) are what
* determine the genotype calls (see below).
*/
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {

View File

@ -71,9 +71,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// gdebug removeme
// todo -cleanup
private HaplotypeIndelErrorModel model;
private boolean useOldWrongHorribleHackedUpLikelihoodModel = false;
//
private GenomeLoc lastSiteVisited;
private ArrayList<Allele> alleleList;
@ -84,26 +81,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
if (UAC.GSA_PRODUCTION_ONLY == false) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
useOldWrongHorribleHackedUpLikelihoodModel = false;
}
else {
useOldWrongHorribleHackedUpLikelihoodModel = true;
double INSERTION_START_PROBABILITY = 1e-3;
double INSERTION_END_PROBABILITY = 0.5;
double ALPHA_DELETION_PROBABILITY = 1e-3;
model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY,
INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
}
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.BANDED_INDEL_COMPUTATION);
alleleList = new ArrayList<Allele>();
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
@ -122,10 +101,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
GenomeLoc loc = ref.getLocus();
ArrayList<Allele> aList = new ArrayList<Allele>();
if (DEBUG) {
System.out.println("'''''''''''''''''''''");
System.out.println("Loc:"+loc.toString());
}
HashMap<String,Integer> consensusIndelStrings = new HashMap<String,Integer>();
int insCount = 0, delCount = 0;
@ -159,12 +134,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
continue;
}
if (DEBUG && p.isIndel()) {
/* if (DEBUG && p.isIndel()) {
System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
p.getEventLength(),p.getType().toString(), p.getEventBases());
}
*/
String indelString = p.getEventBases();
if (p.isInsertion()) {
@ -234,7 +209,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
}
if (DEBUG) {
/* if (DEBUG) {
int icount = indelPileup.getNumberOfInsertions();
int dcount = indelPileup.getNumberOfDeletions();
if (icount + dcount > 0)
@ -248,7 +223,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
System.out.println();
}
}
} */
}
int maxAlleleCnt = 0;
@ -259,8 +234,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
maxAlleleCnt = curCnt;
bestAltAllele = s;
}
if (DEBUG)
System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
// if (DEBUG)
// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
} //gdebug-
if (maxAlleleCnt < minIndelCountForGenotyping)
@ -383,18 +358,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
}
}
int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
if (useOldWrongHorribleHackedUpLikelihoodModel) {
numPrefBases = 20;
hsize=80;
}
if (DEBUG)
System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
(int)ref.getWindow().size(), loc.getStart(), numPrefBases);
//System.out.println(eventLength);
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
final int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases);
@ -413,13 +381,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
pileup = context.getBasePileup();
if (pileup != null ) {
double[] genotypeLikelihoods;
if (useOldWrongHorribleHackedUpLikelihoodModel)
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
else
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
alleleList,

View File

@ -143,31 +143,24 @@ public class UnifiedArgumentCollection {
@Hidden
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
public int INDEL_HAPLOTYPE_SIZE = 80;
@Hidden
@Argument(fullName = "doContextDependentGapPenalties", shortName = "doCDP", doc = "Vary gap penalties by context", required = false)
public boolean DO_CONTEXT_DEPENDENT_PENALTIES = true;
//gdebug+
// experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!!
// @Hidden
// @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false)
// public boolean GET_GAP_PENALTIES_FROM_DATA = false;
//
// @Hidden
// @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE")
// public File INDEL_RECAL_FILE = new File("indel.recal_data.csv");
@Hidden
@Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false)
public boolean GET_GAP_PENALTIES_FROM_DATA = false;
@Hidden
@Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE")
public File INDEL_RECAL_FILE = new File("indel.recal_data.csv");
@Argument(fullName = "bandedIndel", shortName = "bandedIndel", doc = "Banded Indel likelihood computation", required = false)
public boolean BANDED_INDEL_COMPUTATION = false;
@Hidden
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@Hidden
@Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false)
public boolean dovit = false;
@Hidden
@Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false)
public boolean GSA_PRODUCTION_ONLY = false;
@Hidden
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
public boolean IGNORE_SNP_ALLELES = false;
@ -204,17 +197,12 @@ public class UnifiedArgumentCollection {
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
uac.DO_CONTEXT_DEPENDENT_PENALTIES = DO_CONTEXT_DEPENDENT_PENALTIES;
uac.alleles = alleles;
uac.GET_GAP_PENALTIES_FROM_DATA = GET_GAP_PENALTIES_FROM_DATA;
uac.INDEL_RECAL_FILE = INDEL_RECAL_FILE;
// todo- arguments to remove
uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT;
uac.dovit = dovit;
uac.GSA_PRODUCTION_ONLY = GSA_PRODUCTION_ONLY;
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION;
return uac;
}

View File

@ -81,20 +81,17 @@ public class PileupElement {
//
// --------------------------------------------------------------------------
private Integer getReducedReadQualityTagValue() {
return getRead().getIntegerAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG);
}
public boolean isReducedRead() {
return getReducedReadQualityTagValue() != null;
return ReadUtils.isReducedRead(getRead());
}
public int getReducedCount() {
if ( ! isReducedRead() ) throw new IllegalArgumentException("Cannot get reduced count for non-reduced read " + getRead().getReadName());
return (int)getQual();
}
public byte getReducedQual() {
return (byte)(int)getReducedReadQualityTagValue();
return (byte)(int)ReadUtils.getReducedReadQualityTagValue(getRead());
}
}

View File

@ -43,10 +43,43 @@ import java.util.*;
* @version 0.1
*/
public class ReadUtils {
public static final String REDUCED_READ_QUALITY_TAG = "RQ";
private ReadUtils() { }
// ----------------------------------------------------------------------------------------------------
//
// Reduced read utilities
//
// ----------------------------------------------------------------------------------------------------
public static final String REDUCED_READ_QUALITY_TAG = "RQ";
/**
 * Returns the value of the reduced-read quality tag (REDUCED_READ_QUALITY_TAG, "RQ") on the
 * given read, or null if the read does not carry the tag.
 *
 * @param read the read to inspect
 * @return the Integer tag value, or null when the tag is absent
 */
public final static Integer getReducedReadQualityTagValue(final SAMRecord read) {
return read.getIntegerAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG);
}
/**
 * Returns true iff the read is a reduced read, i.e. it carries the reduced-read quality tag.
 *
 * @param read the read to test
 * @return true when the "RQ" tag is present on the read
 */
public final static boolean isReducedRead(final SAMRecord read) {
return getReducedReadQualityTagValue(read) != null;
}
/**
 * Returns a clone of the given reduced read whose base qualities have all been replaced by the
 * single reduced quality stored in the "RQ" tag. The input read is not modified.
 *
 * @param read a reduced read (must carry the reduced-read quality tag)
 * @return a cloned read with every base quality set to the reduced quality value
 * @throws IllegalArgumentException if the read is not a reduced read
 */
public final static SAMRecord reducedReadWithReducedQuals(final SAMRecord read) {
if ( ! isReducedRead(read) ) throw new IllegalArgumentException("read must be a reduced read");
try {
SAMRecord newRead = (SAMRecord)read.clone();
// narrow the Integer tag value down to a byte quality score
byte reducedQual = (byte)(int)getReducedReadQualityTagValue(read);
byte[] newQuals = new byte[read.getBaseQualities().length];
Arrays.fill(newQuals, reducedQual);
newRead.setBaseQualities(newQuals);
return newRead;
} catch ( CloneNotSupportedException e ) {
// SAMRecord implements Cloneable today; this should be unreachable unless the library changes
throw new ReviewedStingException("SAMRecord no longer supports clone", e);
}
}
// ----------------------------------------------------------------------------------------------------
//
// General utilities
//
// ----------------------------------------------------------------------------------------------------
public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) {
SAMFileHeader copy = new SAMFileHeader();
@ -157,7 +190,7 @@ public class ReadUtils {
* |----------------| (interval)
* <--------> (read)
*/
public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED}
public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED}
/**
* God, there's a huge information asymmetry in SAM format:
@ -640,27 +673,35 @@ public class ReadUtils {
*/
public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) {
int start = getRefCoordSoftUnclippedStart(read);
int stop = getRefCoordSoftUnclippedEnd(read);
int sStart = getRefCoordSoftUnclippedStart(read);
int sStop = getRefCoordSoftUnclippedEnd(read);
int uStart = read.getUnclippedStart();
int uStop = read.getUnclippedEnd();
if ( !read.getReferenceName().equals(interval.getContig()) )
return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;
else if ( stop < interval.getStart() )
else if ( uStop < interval.getStart() )
return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;
else if ( start > interval.getStop() )
else if ( uStart > interval.getStop() )
return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;
else if ( (start >= interval.getStart()) &&
(stop <= interval.getStop()) )
else if ( sStop < interval.getStart() )
return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT;
else if ( sStart > interval.getStop() )
return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT;
else if ( (sStart >= interval.getStart()) &&
(sStop <= interval.getStop()) )
return ReadAndIntervalOverlap.OVERLAP_CONTAINED;
else if ( (start < interval.getStart()) &&
(stop > interval.getStop()) )
else if ( (sStart < interval.getStart()) &&
(sStop > interval.getStop()) )
return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;
else if ( (start < interval.getStart()) )
else if ( (sStart < interval.getStart()) )
return ReadAndIntervalOverlap.OVERLAP_LEFT;
else

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
@ -12,9 +13,10 @@ import org.testng.annotations.Test;
public class ReadUtilsUnitTest extends BaseTest {
SAMRecord read;
SAMRecord read, reducedRead;
final static String BASES = "ACTG";
final static String QUALS = "!+5?";
final private static int REDUCED_READ_QUAL = 20;
@BeforeTest
public void init() {
@ -23,6 +25,11 @@ public class ReadUtilsUnitTest extends BaseTest {
read.setReadUnmappedFlag(true);
read.setReadBases(new String(BASES).getBytes());
read.setBaseQualityString(new String(QUALS));
reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length());
reducedRead.setReadBases(BASES.getBytes());
reducedRead.setBaseQualityString(QUALS);
reducedRead.setAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG, REDUCED_READ_QUAL);
}
private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) {
@ -38,4 +45,40 @@ public class ReadUtilsUnitTest extends BaseTest {
@Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); }
@Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); }
@Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); }
@Test
public void testReducedReads() {
// a normal read must not look reduced and must have no "RQ" tag value
Assert.assertFalse(ReadUtils.isReducedRead(read), "isReducedRead is false for normal read");
Assert.assertEquals(ReadUtils.getReducedReadQualityTagValue(read), null, "No reduced read tag in normal read");
// the fixture reducedRead was tagged with REDUCED_READ_QUAL in init()
Assert.assertTrue(ReadUtils.isReducedRead(reducedRead), "isReducedRead is true for reduced read");
Assert.assertEquals((int) ReadUtils.getReducedReadQualityTagValue(reducedRead), REDUCED_READ_QUAL, "Reduced read tag is set to expected value");
}
@Test
public void testreducedReadWithReducedQualsWithReducedRead() {
// bases must be preserved, and every base quality replaced by the reduced quality
SAMRecord replacedRead = ReadUtils.reducedReadWithReducedQuals(reducedRead);
Assert.assertEquals(replacedRead.getReadBases(), reducedRead.getReadBases());
Assert.assertEquals(replacedRead.getBaseQualities().length, reducedRead.getBaseQualities().length);
for ( int i = 0; i < replacedRead.getBaseQualities().length; i++)
Assert.assertEquals(replacedRead.getBaseQualities()[i], REDUCED_READ_QUAL);
}
// passing a non-reduced read must be rejected with IllegalArgumentException
@Test(expectedExceptions = IllegalArgumentException.class)
public void testreducedReadWithReducedQualsWithNormalRead() {
ReadUtils.reducedReadWithReducedQuals(read);
}
@Test
public void testReducedReadPileupElement() {
PileupElement readp = new PileupElement(read,0);
PileupElement reducedreadp = new PileupElement(reducedRead,0);
// reduced-ness is detected through the pileup element wrappers too
Assert.assertFalse(readp.isReducedRead());
Assert.assertTrue(reducedreadp.isReducedRead());
// getReducedCount reads the base quality at offset 0; QUALS starts with '!' which encodes quality 0
Assert.assertEquals(reducedreadp.getReducedCount(), 0);
Assert.assertEquals(reducedreadp.getReducedQual(), REDUCED_READ_QUAL);
}
}

View File

@ -29,7 +29,7 @@
<!-- Core walkers -->
<package name="org.broadinstitute.sting.gatk.walkers.**" />
<!-- All non-oneoff GATK-specific RODs -->
<package name="org.broadinstitute.sting.gatk.refdata.**" />
<package name="org.broadinstitute.sting.utils.codecs.**" />
<!-- Filters -->
<package name="org.broadinstitute.sting.gatk.filters" />
<!-- Tribble codecs -->

View File

@ -37,7 +37,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException
/**
* Entry point of Queue. Compiles and runs QScripts passed in to the command line.
*/
object QCommandLine {
object QCommandLine extends Logging {
/**
* Main.
* @param argv Arguments.
@ -45,22 +45,23 @@ object QCommandLine {
def main(argv: Array[String]) {
val qCommandLine = new QCommandLine
Runtime.getRuntime.addShutdownHook(new Thread {
/** Cleanup as the JVM shuts down. */
val shutdownHook = new Thread {
override def run() {
logger.info("Shutting down jobs. Please wait...")
ProcessController.shutdown()
qCommandLine.shutdown()
}
})
}
Runtime.getRuntime.addShutdownHook(shutdownHook)
try {
CommandLineProgram.start(qCommandLine, argv);
Runtime.getRuntime.removeShutdownHook(shutdownHook)
if (CommandLineProgram.result != 0)
System.exit(CommandLineProgram.result);
} catch {
case e: Exception => CommandLineProgram.exitSystemWithError(e)
} finally {
}
}
}