Added ability to call multiallelic indels, if -multiallelic is included in UG arguments. Simple idea: we genotype all alleles with count >= minIndelCnt.

To support this, refactored code that computes consensus alleles. To ease merging of mulitple alt alleles, we create a single vc for each alt alleles and then use VariantContextUtils.simpleMerge to carry out merging, which takes care of handling all corner conditions already. In order to use this, interface to GenotypeLikelihoodsCalculationModel changed to pass in a GenomeLocParser object (why are these objects to hard to handle??).
More testing is required and feature turned off my default.
This commit is contained in:
Guillermo del Angel 2012-01-06 11:24:38 -05:00
parent 43224ef364
commit d4e7655d14
5 changed files with 106 additions and 95 deletions

View File

@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -72,25 +73,28 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
this.logger = logger;
}
/**
* Must be overridden by concrete subclasses
*
* @param tracker rod data
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param priors priors to use for GLs
* @param alternateAlleleToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @return variant context where genotypes are no-called but with GLs
*/
public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Allele alternateAlleleToUse,
boolean useBAQedPileup);
/**
* Can be overridden by concrete subclasses
*
* @param tracker rod data
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param priors priors to use for GLs
* @param alternateAlleleToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @param locParser Genome Loc Parser
* @return variant context where genotypes are no-called but with GLs
*/
public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Allele alternateAlleleToUse,
boolean useBAQedPileup,
GenomeLocParser locParser);
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
@ -54,17 +55,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private final boolean getAlleleListFromVCF;
private boolean DEBUG = false;
private final boolean doMultiAllelicCalls;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private final int maxAlternateAlleles;
private PairHMMIndelErrorModel pairModel;
private static ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>> indelLikelihoodMap =
new ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>>() {
protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() {
return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
}
};
protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() {
return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
}
};
private LinkedHashMap<Allele,Haplotype> haplotypeMap;
@ -87,6 +88,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES;
doMultiAllelicCalls = UAC.MULTI_ALLELIC;
haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
@ -95,7 +98,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) {
Allele refAllele=null, altAllele=null;
GenomeLoc loc = ref.getLocus();
ArrayList<Allele> aList = new ArrayList<Allele>();
@ -114,7 +117,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
return aList;
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
@ -126,9 +129,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) {
//SAMRecord read = p.getRead();
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
continue;
if(ReadUtils.is454Read(read)) {
continue;
}
@ -208,63 +211,69 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
}
/* if (DEBUG) {
int icount = indelPileup.getNumberOfInsertions();
int dcount = indelPileup.getNumberOfDeletions();
if (icount + dcount > 0)
{
List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
System.out.format("#ins: %d, #del:%d\n", insCount, delCount);
for (int i=0 ; i < eventStrings.size() ; i++ ) {
System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second);
// int k=0;
}
System.out.println();
}
} */
}
Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0;
String bestAltAllele = "";
for (String s : consensusIndelStrings.keySet()) {
int curCnt = consensusIndelStrings.get(s);
if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
bestAltAllele = s;
int curCnt = consensusIndelStrings.get(s), stop = 0;
// if observed count if above minimum threshold, we will genotype this allele
if (curCnt < minIndelCountForGenotyping)
continue;
if (s.startsWith("D")) {
// get deletion length
int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
stop = loc.getStart() + dLen;
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases,true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
}
else {
// insertion case
if (Allele.acceptableAlleleBases(s)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(s, false);
stop = loc.getStart();
}
}
// if (DEBUG)
// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
} //gdebug-
if (maxAlleleCnt < minIndelCountForGenotyping)
return aList;
if (bestAltAllele.startsWith("D")) {
// get deletion length
int dLen = Integer.valueOf(bestAltAllele.substring(1));
// get ref bases of accurate deletion
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
ArrayList vcAlleles = new ArrayList<Allele>();
vcAlleles.add(refAllele);
vcAlleles.add(altAllele);
//System.out.println(new String(ref.getBases()));
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
final VariantContextBuilder builder = new VariantContextBuilder().source("");
builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(vcAlleles);
builder.referenceBaseForIndel(ref.getBase());
builder.noGenotypes();
if (doMultiAllelicCalls)
vcs.add(builder.make());
else {
if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases,true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
}
else {
// insertion case
if (Allele.acceptableAlleleBases(bestAltAllele)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(bestAltAllele, false);
}
}
if (refAllele != null && altAllele != null) {
aList.add(0,refAllele);
aList.add(1,altAllele);
}
if (vcs.isEmpty())
return aList; // nothing else to do, no alleles passed minimum count criterion
VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
aList = new ArrayList<Allele>(mergedVC.getAlleles());
return aList;
}
@ -277,7 +286,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Allele alternateAlleleToUse,
boolean useBAQedPileup) {
boolean useBAQedPileup, GenomeLocParser locParser) {
if ( tracker == null )
return null;
@ -294,17 +303,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
haplotypeMap.clear();
if (getAlleleListFromVCF) {
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
if( vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input;
break;
}
}
// ignore places where we don't have a variant
if ( vc == null )
return null;
for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
if( vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input;
break;
}
}
// ignore places where we don't have a variant
if ( vc == null )
return null;
alleleList.clear();
if (ignoreSNPAllelesWhenGenotypingIndels) {
@ -323,7 +332,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
else {
alleleList = computeConsensusAlleles(ref,contexts, contextType);
alleleList = computeConsensusAlleles(ref,contexts, contextType, locParser);
if (alleleList.isEmpty())
return null;
}
@ -340,7 +349,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (alleleList.isEmpty())
return null;
refAllele = alleleList.get(0);
altAllele = alleleList.get(1);

View File

@ -30,10 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException;
@ -66,7 +63,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final AlignmentContextUtils.ReadOrientation contextType,
final GenotypePriors priors,
final Allele alternateAlleleToUse,
final boolean useBAQedPileup) {
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
if ( !(priors instanceof DiploidSNPGenotypePriors) )
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");

View File

@ -109,7 +109,7 @@ public class UnifiedArgumentCollection {
* For advanced users only.
*/
@Advanced
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false)
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false)
public boolean MULTI_ALLELIC = false;
/**

View File

@ -243,7 +243,7 @@ public class UnifiedGenotyperEngine {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
}
return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine);
return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser);
}
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {