Big UG refactoring and intermediate commit to support indels in pool caller (not done yet). Lots of code pulled out of long spaghetti-like functions and modularized to be easily shareable. Add functionality in ErrorModel to count indel matches/mismatches (but left part disabled so as not to change integration tests in this commit), add computation of pool genotype likelihoods for indels (not fully working yet in more realistic cases, only working in artificial nice pools). Lots of TBDs still, but existing UG and pool SNP functionality should be intact

This commit is contained in:
Guillermo del Angel 2012-06-13 11:14:44 -04:00
parent bb77aa88c3
commit aee66ab157
5 changed files with 147 additions and 108 deletions

View File

@ -89,7 +89,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param alternateAllelesToUse the alternate allele to use, null if not set
* @param allAllelesToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @param locParser Genome Loc Parser
* @return variant context where genotypes are no-called but with GLs
@ -98,7 +98,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser);

View File

@ -35,7 +35,6 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -44,9 +43,7 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final int HAPLOTYPE_SIZE;
private final boolean getAlleleListFromVCF;
private static final int HAPLOTYPE_SIZE = 80;
private boolean DEBUG = false;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
@ -75,17 +72,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
}
protected List<Allele> computeConsensusAlleles(ReferenceContext ref,
protected static List<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenomeLocParser locParser) {
GenomeLocParser locParser, UnifiedArgumentCollection UAC) {
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
return counter.computeConsensusAlleles(ref, contexts, contextType);
}
@ -96,7 +91,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
@ -104,95 +99,26 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return null;
GenomeLoc loc = ref.getLocus();
Allele refAllele, altAllele;
VariantContext vc = null;
boolean allelesArePadded = true;
if (!ref.getLocus().equals(lastSiteVisited)) {
// starting a new site: clear allele list
alleleList.clear();
lastSiteVisited = ref.getLocus();
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
haplotypeMap.clear();
if (getAlleleListFromVCF) {
for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) {
if (vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input;
break;
}
}
// ignore places where we don't have a variant
if (vc == null)
return null;
alleleList.clear();
if (ignoreSNPAllelesWhenGenotypingIndels) {
// if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it
for (Allele a : vc.getAlleles())
if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
continue;
else
alleleList.add(a);
} else {
for (Allele a : vc.getAlleles())
alleleList.add(a);
}
if (vc.getReference().getBases().length == vc.getEnd()-vc.getStart()+1)
allelesArePadded = false;
} else {
alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser);
if (alleleList.isEmpty())
return null;
}
alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels);
if (alleleList.isEmpty())
return null;
}
// protect against having an indel too close to the edge of a contig
if (loc.getStart() <= HAPLOTYPE_SIZE)
getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap);
if (haplotypeMap == null || haplotypeMap.isEmpty())
return null;
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
return null;
if (alleleList.isEmpty())
return null;
refAllele = alleleList.get(0);
altAllele = alleleList.get(1);
// look for alt allele that has biggest length distance to ref allele
int maxLenDiff = 0;
for (Allele a : alleleList) {
if (a.isNonReference()) {
int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
if (lenDiff > maxLenDiff) {
maxLenDiff = lenDiff;
altAllele = a;
}
}
}
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1;
final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
if (hsize <= 0) {
logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString()));
return null;
}
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases);
// start making the VariantContext
// For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base.
int endLoc = loc.getStart() + refAllele.length()-1;
if (allelesArePadded)
endLoc++;
final int endLoc = computeEndLocation(alleleList, loc);
final int eventLength = getEventLength(alleleList);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase());
// create the genotypes; no-call everyone for now
@ -209,10 +135,10 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (context.hasBasePileup()) {
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
HashMap<String, Object> attributes = new HashMap<String, Object>();
final HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
@ -234,6 +160,103 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get();
}
/**
 * Computes the (inclusive) end coordinate of the variant context built from the given alleles.
 *
 * For non-SNP records the end is start + reference-allele length - 1, plus one extra base
 * when the alleles carry a padding base. The padding detection is currently disabled (see
 * FIXME below), so the padded end is always returned.
 *
 * @param alleles allele list whose first element is taken to be the reference allele
 * @param loc     genomic location of the event start
 * @return the end coordinate to use when building the VariantContext
 */
public static int computeEndLocation(final List<Allele> alleles, final GenomeLoc loc) {
Allele refAllele = alleles.get(0);
// NOTE(review): hard-wired to true — the padding check below is disabled, so every
// caller currently gets the "padded" end coordinate.
boolean allelesArePadded = true;
int endLoc = loc.getStart() + refAllele.length()-1;
/* if (refAllele.getBases().length == vc.getEnd()-vc.getStart()+1)
allelesArePadded = false;
*/
// todo FIXME! The disabled check above depended on a VariantContext ("vc") that is no
// longer in scope in this static helper; until it is reinstated, alleles are always
// treated as padded.
if (allelesArePadded)
endLoc++;
return endLoc;
}
/**
 * Populates {@code haplotypeMap} with the haplotypes spanning the given alleles at {@code loc}.
 *
 * The map is emptied instead of being filled when no haplotypes can be built: the event lies
 * too close to the start of the contig, the reference window is too short to hold a full
 * haplotype (an issue at the end of contigs), or the allele list is empty.
 *
 * @param alleleList   alleles to build haplotypes for (first entry is the reference allele)
 * @param ref          reference context supplying the surrounding bases
 * @param loc          genomic location of the event
 * @param haplotypeMap output map: filled on success, cleared on any failure condition
 */
public static void getHaplotypeMapFromAlleles(final List<Allele> alleleList,
                                              final ReferenceContext ref,
                                              final GenomeLoc loc,
                                              final LinkedHashMap<Allele, Haplotype> haplotypeMap) {
    // Guard clauses: every failure mode empties the map so callers see "no haplotypes".
    // protect against having an indel too close to the edge of a contig
    if (loc.getStart() <= HAPLOTYPE_SIZE) {
        haplotypeMap.clear();
        return;
    }
    // check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
    if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) {
        haplotypeMap.clear();
        return;
    }
    if (alleleList.isEmpty()) {
        haplotypeMap.clear();
        return;
    }
    final int eventLength = getEventLength(alleleList);
    final int haplotypeSpan = ref.getWindow().size() - Math.abs(eventLength) - 1;
    final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
    haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
            ref, haplotypeSpan, numPrefBases));
}
/**
 * Returns the signed length difference between the reference allele and the alternate allele
 * whose length differs most from it (positive for insertions, negative for deletions).
 *
 * @param alleleList allele list whose first element is the reference allele; must be non-empty
 * @return alt length minus ref length for the length-wise most extreme non-reference allele,
 *         or 0 when the list carries no non-reference allele of different length
 */
public static int getEventLength(List<Allele> alleleList) {
    final Allele refAllele = alleleList.get(0);
    // Default to the reference itself so a list without a usable alt yields an event
    // length of 0 instead of an IndexOutOfBoundsException (the original blindly read
    // index 1, which blows up on a single-allele list).
    Allele altAllele = refAllele;
    // look for the alt allele that has the biggest length distance to the ref allele
    int maxLenDiff = 0;
    for (Allele a : alleleList) {
        if (a.isNonReference()) {
            final int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
            if (lenDiff > maxLenDiff) {
                maxLenDiff = lenDiff;
                altAllele = a;
            }
        }
    }
    return altAllele.getBaseString().length() - refAllele.getBaseString().length();
}
/**
 * Builds the starting allele list for indel genotyping at the current site.
 *
 * In GENOTYPE_GIVEN_ALLELES mode the alleles come from the user-supplied ROD binding: the
 * first VariantContext of an allowable type that starts exactly at this locus is used, and an
 * empty list is returned when none matches. In discovery mode, consensus alleles are computed
 * from the read pileups instead.
 *
 * @param tracker   ROD tracker used to look up user-provided alleles
 * @param ref       reference context at the site
 * @param contexts  per-sample alignment contexts
 * @param contextType stratified context type (forward/reverse/complete)
 * @param locParser genome loc parser
 * @param UAC       unified genotyper argument collection
 * @param ignoreSNPAllelesWhenGenotypingIndels when true, drop ref-length alt alleles (SNPs/MNPs)
 * @return the allele list to genotype; empty when nothing usable was found
 */
public static List<Allele> getInitialAlleleList(final RefMetaDataTracker tracker,
                                                final ReferenceContext ref,
                                                final Map<String, AlignmentContext> contexts,
                                                final AlignmentContextUtils.ReadOrientation contextType,
                                                final GenomeLocParser locParser,
                                                final UnifiedArgumentCollection UAC,
                                                final boolean ignoreSNPAllelesWhenGenotypingIndels) {
    if (UAC.GenotypingMode != GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
        // discovery mode: derive candidate alleles directly from the pileups
        return IndelGenotypeLikelihoodsCalculationModel.computeConsensusAlleles(ref, contexts, contextType, locParser, UAC);
    }
    final List<Allele> alleles = new ArrayList<Allele>();
    VariantContext vc = null;
    for (final VariantContext candidate : tracker.getValues(UAC.alleles, ref.getLocus())) {
        final boolean usable = candidate != null &&
                allowableTypes.contains(candidate.getType()) &&
                ref.getLocus().getStart() == candidate.getStart();
        if (usable) {
            vc = candidate;
            break;
        }
    }
    // ignore places where we don't have a variant
    if (vc == null)
        return alleles;
    if (!ignoreSNPAllelesWhenGenotypingIndels) {
        alleles.addAll(vc.getAlleles());
    } else {
        // if there's an allele that has same length as the reference (i.e. a SNP or MNP),
        // ignore it and don't genotype it
        final int refLength = vc.getReference().getBases().length;
        for (final Allele a : vc.getAlleles()) {
            if (!(a.isNonReference() && a.getBases().length == refLength))
                alleles.add(a);
        }
    }
    return alleles;
}
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {

View File

@ -62,7 +62,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
@ -70,11 +70,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
final Allele refAllele = Allele.create(refBase, true);
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// calculate the GLs
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
@ -90,9 +85,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup)));
}
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// find the alternate allele(s) that we should be using
if ( alternateAllelesToUse != null ) {
alleles.addAll(alternateAllelesToUse);
if ( allAllelesToUse != null ) {
alleles.addAll(allAllelesToUse.subList(1,allAllelesToUse.size())); // this includes ref allele
} else if ( useAlleleFromVCF ) {
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);

View File

@ -378,10 +378,10 @@ public class UnifiedGenotyperEngine {
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
List<Allele> allAllelesToUse = builder.make().getAlleles();
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model);
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model);
AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
@ -390,7 +390,7 @@ public class UnifiedGenotyperEngine {
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model);
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model);
AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);

View File

@ -117,7 +117,7 @@ public class PairHMMIndelErrorModel {
}
static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) {
static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) {
// compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG
// 001000012345000000
@ -164,10 +164,24 @@ public class PairHMMIndelErrorModel {
}
}
}
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
/**
 * Computes diploid genotype log10 likelihoods for an indel event from the read pileup.
 *
 * Delegates the per-read/per-haplotype likelihood computation to
 * computeGeneralReadHaplotypeLikelihoods and then folds the per-read matrix into the diploid
 * haplotype-pair likelihood vector via getDiploidHaplotypeLikelihoods.
 *
 * @param pileup             reads overlapping the event
 * @param haplotypeMap       candidate haplotypes keyed by allele
 * @param ref                reference context around the event
 * @param eventLength        signed indel length (alt minus ref)
 * @param indelLikelihoodMap per-read cache of per-allele likelihoods, filled as a side effect
 * @return flattened diploid genotype log10-likelihood array
 */
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele, Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap){
    final int numHaplotypes = haplotypeMap.size();
    // readCounts is an output parameter, filled by the general computation below.
    // (Removed a duplicate, never-used local allocation of readLikelihoods that shadowed
    // the matrix returned by computeGeneralReadHaplotypeLikelihoods — duplicate local
    // variable declarations do not compile.)
    final int[] readCounts = new int[pileup.getNumberOfElements()];
    final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts);
    return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
final ReferenceContext ref,
final int eventLength,
final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap,
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0;
@ -367,7 +381,7 @@ public class PairHMMIndelErrorModel {
}
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
return readLikelihoods;
}
private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
@ -385,7 +399,7 @@ public class PairHMMIndelErrorModel {
return b1.length;
}
private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix